In [1]:
#!pip install yellowbrick
#!pip install squarify
#!pip install function_utils

Import des librairies

In [89]:
# Import des librairies

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
pd.set_option('display.max_columns', 100)

import missingno as msno
import plotly.graph_objects as go
import plotly.express as px

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import silhouette_score, adjusted_rand_score
from sklearn import metrics

import category_encoders as ce

from plotly.subplots import make_subplots

import datetime
from sklearn.decomposition import PCA

from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from yellowbrick.cluster import KElbowVisualizer
from yellowbrick.cluster import intercluster_distance
from yellowbrick.cluster import SilhouetteVisualizer

import squarify

Import dataset

In [3]:
df_olist =pd.read_csv('Dataset/olist_final_cleaned.csv',index_col=0,parse_dates=['order_purchase_timestamp','order_approved_at','order_delivered_carrier_date','order_delivered_customer_date','order_estimated_delivery_date','shipping_limit_date'])
df_olist.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 94017 entries, 0 to 99440
Data columns (total 27 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   customer_id                    94017 non-null  object        
 1   customer_unique_id             94017 non-null  object        
 2   customer_zip_code_prefix       94017 non-null  int64         
 3   customer_city                  94017 non-null  object        
 4   customer_state                 94017 non-null  object        
 5   geolocation_lat                94017 non-null  float64       
 6   geolocation_lng                94017 non-null  float64       
 7   order_id                       94017 non-null  object        
 8   order_status                   94017 non-null  object        
 9   order_purchase_timestamp       94017 non-null  datetime64[ns]
 10  order_approved_at              94017 non-null  datetime64[ns]
 11  order_delivered_carrier_date   94017 non-null  datetime64[ns]
 12  order_delivered_customer_date  94017 non-null  datetime64[ns]
 13  order_estimated_delivery_date  94017 non-null  datetime64[ns]
 14  payment_sequential             94017 non-null  float64       
 15  payment_type                   94017 non-null  object        
 16  payment_installments           94017 non-null  float64       
 17  payment_value                  94017 non-null  float64       
 18  review_score                   94017 non-null  int64         
 19  order_item_id                  94017 non-null  float64       
 20  product_id                     94017 non-null  object        
 21  shipping_limit_date            94017 non-null  datetime64[ns]
 22  price                          94017 non-null  float64       
 23  freight_value                  94017 non-null  float64       
 24  product_category_name          94017 non-null  object        
 25  seller_city                    94017 non-null  object        
 26  seller_state                   94017 non-null  object        
dtypes: datetime64[ns](6), float64(8), int64(2), object(11)
memory usage: 20.1+ MB
In [4]:
df_olist.head()
Out[4]:
customer_id customer_unique_id customer_zip_code_prefix customer_city customer_state geolocation_lat geolocation_lng order_id order_status order_purchase_timestamp order_approved_at order_delivered_carrier_date order_delivered_customer_date order_estimated_delivery_date payment_sequential payment_type payment_installments payment_value review_score order_item_id product_id shipping_limit_date price freight_value product_category_name seller_city seller_state
0 06b8999e2fba1a1fbc88172c00ba8bc7 861eff4711a542e4b93843c6dd7febb0 14409 franca SP -20.498489 -47.396929 00e7ee1b050b8499577073aeb2a297a1 delivered 2017-05-16 15:05:35 2017-05-16 15:22:12 2017-05-23 10:47:57 2017-05-25 10:35:35 2017-06-05 1.0 credit_card 2.0 146.87 4 1.0 a9516a079e37a9c9c36b9b78b10169e8 2017-05-22 15:22:12 124.99 21.88 Home Decor itaquaquecetuba SP
1 18955e83d337fd6b2def6b18a428ac77 290c77bc529b7ac935b93aa66c333dc3 9790 sao bernardo do campo SP -23.727992 -46.542848 29150127e6685892b6eab3eec79f59c7 delivered 2018-01-12 20:48:24 2018-01-12 20:58:32 2018-01-15 17:14:59 2018-01-29 12:41:19 2018-02-06 1.0 credit_card 8.0 335.48 5 1.0 4aa6014eceb682077f9dc4bffebc05b0 2018-01-18 20:58:32 289.00 46.48 Home Decor itajai SC
2 4e7b3e00288586ebd08712fdd0374a03 060e732b5b29e8181a18229c7b0b2b5e 1151 sao paulo SP -23.531642 -46.656289 b2059ed67ce144a36e2aa97d2c9e9ad2 delivered 2018-05-19 16:07:45 2018-05-20 16:19:10 2018-06-11 14:31:00 2018-06-14 17:58:51 2018-06-13 1.0 credit_card 7.0 157.73 5 1.0 bd07b66896d6f1494f5b86251848ced7 2018-06-05 16:19:10 139.94 17.79 Home Decor itaquaquecetuba SP
3 b2b6027bc5c5109e529d4dc6358b12c3 259dac757896d24d7702b9acbbff3f3c 8775 mogi das cruzes SP -23.499702 -46.185233 951670f92359f4fe4a63112aa7306eba delivered 2018-03-13 16:06:38 2018-03-13 17:29:19 2018-03-27 23:22:42 2018-03-28 16:04:25 2018-04-10 1.0 credit_card 1.0 173.30 5 1.0 a5647c44af977b148e0a3a4751a09e2e 2018-03-27 16:31:16 149.94 23.36 Home Decor itaquaquecetuba SP
4 4f2d8ab171c80ec8364f7c12e35b23ad 345ecd01c38d18a9036ed96c73b8d066 13056 campinas SP -22.975100 -47.142925 6b7d50bd145f6fc7f33cebabd7e49d0f delivered 2018-07-29 09:51:30 2018-07-29 10:10:09 2018-07-30 15:16:00 2018-08-09 20:55:48 2018-08-15 1.0 credit_card 8.0 252.25 5 1.0 9391a573abe00141c56e38d84d7d5b3b 2018-07-31 10:10:09 230.00 22.25 Home Decor ibitinga SP
In [5]:
df_olist.shape
Out[5]:
(94017, 27)

Création d'une segmentation RFM

  • Récence : le nombre de jours depuis la dernière commande
  • Fréquence : le nombre de commande passées sur la période étudiée
  • Valeur Monétaire : totale du prix des commandes sur la période étudiée
In [6]:
max_date = max(df_olist['order_purchase_timestamp']) + datetime.timedelta(days=1)
In [7]:
rfm_df = df_olist.groupby('customer_unique_id').agg({
    'order_purchase_timestamp': lambda x: (max_date - x.max()).days,
    'order_id':'count',
    'price':'sum'
}).reset_index()
rfm_df.columns = ['customer_unique_id','recency','frequency','monetary']
In [8]:
#snapshot_date = max(df_olist.order_approved_at) + datetime.timedelta(days=1)
In [9]:
#rfm = df_olist.groupby("customer_unique_id").agg({
#    "order_approved_at" : lambda x: (snapshot_date - x.max()).days,
#    "order_id" : 'count',
#    "payment_value" : "mean"
#})

#rfm.columns = ["Recency", "Frequency", "MonetaryValue"]
In [10]:
rfm_df
Out[10]:
customer_unique_id recency frequency monetary
0 0000366f3b9a7992bf8c76cfdf3221e2 112 1 129.90
1 0000b849f77a49e4a4ce2b2a4ca5be3f 115 1 18.90
2 0000f46a3911fa3c0805444483337064 537 1 69.00
3 0000f6ccb0745a6a4b88665a16c9f078 321 1 25.99
4 0004aac84e0df4da2b147fca70cf8255 288 1 180.00
... ... ... ... ...
91021 fffcf5a5ff07b0908bd4e2dbc735a684 447 1 680.00
91022 fffea47cd6d3cc0a88bd621562a9d061 262 1 64.89
91023 ffff371b4d645b6ecea244b27531430a 568 1 89.90
91024 ffff5962728ec6157033ef9805bacc48 119 1 115.00
91025 ffffd2657e2aad2907e67c3e9daecbeb 484 1 56.99

91026 rows × 4 columns

In [11]:
rfm_df[rfm_df['frequency'] > 1].count()*100 / rfm_df.shape[0]
Out[11]:
customer_unique_id    2.958495
recency               2.958495
frequency             2.958495
monetary              2.958495
dtype: float64
In [12]:
rfm_df.shape
Out[12]:
(91026, 4)
In [13]:
sns.countplot(y="frequency", data=rfm_df)
plt.title("Répartition de la fréquence d'achat")

plt.show()

La plupart des clients réalise un seul achat

In [14]:
fig, axes = plt.subplots(1, 2, figsize=(15, 5), sharex=True)
fig.suptitle('Description des montants dépensés par les clients')

sns.histplot(ax=axes[0], x="monetary", data=rfm_df, kde=True)
axes[0].set_title("Distribution des montants dépensés")

sns.boxplot(ax=axes[1], x="monetary", data=rfm_df)
axes[1].set_title("Boxplot des montants dépensés")

plt.show()

Les sommes dépensées atteignent un maximum de 14000.

In [15]:
fig, axes = plt.subplots(1, 3, figsize=(15, 5), sharex=True)
fig.suptitle('Distribution des variables')

sns.histplot(ax=axes[0], x="recency", data=rfm_df, kde=True)
axes[0].set_title("Distribution des jours passés depuis la dernière commande")

sns.histplot(ax=axes[1], x="frequency", data=rfm_df, kde=True)
axes[1].set_title("Distribution de la fréquence d'achats des clients")

sns.histplot(ax=axes[2], x="monetary", data=rfm_df, kde=True)
axes[2].set_title("Distribution des sommes dépensées par les clients")

plt.tight_layout()
plt.show()
In [16]:
# Plot RFM distributions
plt.figure(figsize=(15,5))

plt.subplot(1, 3, 1); sns.histplot(rfm_df['recency'])
plt.subplot(1, 3, 2); sns.histplot(rfm_df['frequency'])
plt.subplot(1, 3, 3); sns.histplot(rfm_df['monetary'])
# Show the plot
plt.show()
In [17]:
rfm_df['frequency'].unique()
Out[17]:
array([ 1,  2,  3,  4,  6,  7,  5,  9, 14])

Pour la segmentation de notre clientèle à l'aide de K-Means, il sera très important de nous assurer que nous mettons à l'échelle nos données pour centrer la moyenne et les écarts-types. Continuons avec le .qcut() pour notre RFM.

In [18]:
# --Calculate R and F groups--
# Create labels for Recency and Frequency
r_labels = range(4, 0, -1); f_labels = range(1, 5)
# Assign these labels to 4 equal percentile groups 
r_groups = pd.qcut(rfm_df['recency'], q=4, labels=r_labels)
# Assign these labels to 4 equal percentile groups 
f_groups = pd.qcut(rfm_df['frequency'].rank(method='first'), q=4, labels=f_labels)
# Create new columns R and F 
data_process = rfm_df.assign(R = r_groups.values, F = f_groups.values)
data_process.head()
Out[18]:
customer_unique_id recency frequency monetary R F
0 0000366f3b9a7992bf8c76cfdf3221e2 112 1 129.90 4 1
1 0000b849f77a49e4a4ce2b2a4ca5be3f 115 1 18.90 3 1
2 0000f46a3911fa3c0805444483337064 537 1 69.00 1 1
3 0000f6ccb0745a6a4b88665a16c9f078 321 1 25.99 2 1
4 0004aac84e0df4da2b147fca70cf8255 288 1 180.00 2 1
In [19]:
sns.boxplot(x=data_process['recency'])
Out[19]:
<AxesSubplot:xlabel='recency'>
In [20]:
sns.boxplot(x=data_process['frequency'])
Out[20]:
<AxesSubplot:xlabel='frequency'>
In [21]:
sns.boxplot(x=data_process['monetary'])
Out[21]:
<AxesSubplot:xlabel='monetary'>
In [22]:
# Create labels for MonetaryValue
m_labels = range(1, 5)
# Assign these labels to three equal percentile groups 
m_groups = pd.qcut(data_process['monetary'], q=4, labels=m_labels)
# Create new column M
data_process = data_process.assign(M = m_groups.values)
In [23]:
# Concat RFM quartile values to create RFM Segments
def join_rfm(x): return str(x['R']) + str(x['F']) + str(x['M'])
data_process['RFM_Segment_Concat'] = data_process.apply(join_rfm, axis=1)
rfm = data_process
rfm.head()
Out[23]:
customer_unique_id recency frequency monetary R F M RFM_Segment_Concat
0 0000366f3b9a7992bf8c76cfdf3221e2 112 1 129.90 4 1 3 413
1 0000b849f77a49e4a4ce2b2a4ca5be3f 115 1 18.90 3 1 1 311
2 0000f46a3911fa3c0805444483337064 537 1 69.00 1 1 2 112
3 0000f6ccb0745a6a4b88665a16c9f078 321 1 25.99 2 1 1 211
4 0004aac84e0df4da2b147fca70cf8255 288 1 180.00 2 1 4 214
In [24]:
# Count num of unique segments
rfm_count_unique = rfm.groupby('RFM_Segment_Concat')['RFM_Segment_Concat'].nunique()
print(rfm_count_unique.sum())
64
In [25]:
# Calculate RFM_Score
rfm['RFM_Score'] = rfm[['R','F','M']].sum(axis=1)
print(rfm['RFM_Score'].head())
0    8
1    5
2    4
3    4
4    7
Name: RFM_Score, dtype: int64
In [26]:
rfm
Out[26]:
customer_unique_id recency frequency monetary R F M RFM_Segment_Concat RFM_Score
0 0000366f3b9a7992bf8c76cfdf3221e2 112 1 129.90 4 1 3 413 8
1 0000b849f77a49e4a4ce2b2a4ca5be3f 115 1 18.90 3 1 1 311 5
2 0000f46a3911fa3c0805444483337064 537 1 69.00 1 1 2 112 4
3 0000f6ccb0745a6a4b88665a16c9f078 321 1 25.99 2 1 1 211 4
4 0004aac84e0df4da2b147fca70cf8255 288 1 180.00 2 1 4 214 7
... ... ... ... ... ... ... ... ... ...
91021 fffcf5a5ff07b0908bd4e2dbc735a684 447 1 680.00 1 4 4 144 9
91022 fffea47cd6d3cc0a88bd621562a9d061 262 1 64.89 2 4 2 242 8
91023 ffff371b4d645b6ecea244b27531430a 568 1 89.90 1 4 3 143 8
91024 ffff5962728ec6157033ef9805bacc48 119 1 115.00 3 4 3 343 10
91025 ffffd2657e2aad2907e67c3e9daecbeb 484 1 56.99 1 4 2 142 7

91026 rows × 9 columns

Analyzing RFM Segmentation Let’s delve into few interesting segments:

Champions are your best customers, who bought most recently, most often, and are heavy spenders. Reward these customers. They can become early adopters for new products and will help promote your brand.

Potential Loyalists are your recent customers with average frequency and who spent a good amount. Offer membership or loyalty programs or recommend related products to upsell them and help them become your Loyalists or Champions.

New Customers are your customers who have a high overall RFM score but are not frequent shoppers. Start building relationships with these customers by providing onboarding support and special offers to increase their visits.

At Risk Customers are your customers who purchased often and spent big amounts, but haven’t purchased recently. Send them personalized reactivation campaigns to reconnect, and offer renewals and helpful products to encourage another purchase.

Can’t Lose Them are customers who used to visit and purchase quite often, but haven’t been visiting recently. Bring them back with relevant promotions, and run surveys to find out what went wrong and avoid losing them to a competitor.

In [27]:
# Define rfm_level function
def rfm_level(df):
    if df['RFM_Score'] >= 9:
        return 'Can\'t Lose Them'
    elif ((df['RFM_Score'] >= 8) and (df['RFM_Score'] < 9)):
        return 'Champions'
    elif ((df['RFM_Score'] >= 7) and (df['RFM_Score'] < 8)):
        return 'Loyal'
    elif ((df['RFM_Score'] >= 6) and (df['RFM_Score'] < 7)):
        return 'Potential'
    elif ((df['RFM_Score'] >= 5) and (df['RFM_Score'] < 6)):
        return 'Promising'
    elif ((df['RFM_Score'] >= 4) and (df['RFM_Score'] < 5)):
        return 'Needs Attention'
    else:
        return 'Require Activation'
# Create a new variable RFM_Level
rfm['RFM_Level'] = rfm.apply(rfm_level, axis=1)
# Print the header with top 5 rows to the console
rfm.head()
Out[27]:
customer_unique_id recency frequency monetary R F M RFM_Segment_Concat RFM_Score RFM_Level
0 0000366f3b9a7992bf8c76cfdf3221e2 112 1 129.90 4 1 3 413 8 Champions
1 0000b849f77a49e4a4ce2b2a4ca5be3f 115 1 18.90 3 1 1 311 5 Promising
2 0000f46a3911fa3c0805444483337064 537 1 69.00 1 1 2 112 4 Needs Attention
3 0000f6ccb0745a6a4b88665a16c9f078 321 1 25.99 2 1 1 211 4 Needs Attention
4 0004aac84e0df4da2b147fca70cf8255 288 1 180.00 2 1 4 214 7 Loyal
In [29]:
# Calculate average values for each RFM_Level, and return a size of each segment 
rfm_level_agg = rfm.groupby('RFM_Level').agg({
    'recency': 'mean',
    'frequency': 'mean',
    'monetary': ['mean', 'count']
}).round(1)
# Print the aggregated dataset
print(rfm_level_agg)
                   recency frequency monetary       
                      mean      mean     mean  count
RFM_Level                                           
Can't Lose Them      143.3       1.1    205.9  28553
Champions            212.2       1.0    134.2  16654
Loyal                252.9       1.0    106.8  17130
Needs Attention      395.4       1.0     37.8   4451
Potential            299.3       1.0     82.4  14053
Promising            346.6       1.0     51.8   8736
Require Activation   456.1       1.0     26.7   1449

À partir de là, nous pouvons voir qu'un grand pourcentage de nos clients se situent dans les niveaux supérieurs de RFM. Le magasin doit faire quelque chose de bien pour maintenir sa fidélité !

Potential - potentiel élevé pour entrer dans nos segments de clients fidèles

Promising — montre des signes prometteurs avec la quantité et la valeur de leur achat, mais cela fait un moment qu'ils n'ont pas acheté.

Needs Attention — a fait un achat initial mais ne l'a pas vu depuis.

Require Activation — Les moins performants de notre modèle RFM.

In [30]:
rfm_level_agg
Out[30]:
recency frequency monetary
mean mean mean count
RFM_Level
Can't Lose Them 143.3 1.1 205.9 28553
Champions 212.2 1.0 134.2 16654
Loyal 252.9 1.0 106.8 17130
Needs Attention 395.4 1.0 37.8 4451
Potential 299.3 1.0 82.4 14053
Promising 346.6 1.0 51.8 8736
Require Activation 456.1 1.0 26.7 1449
In [31]:
rfm_level_agg.columns = rfm_level_agg.columns.droplevel()
rfm_level_agg.columns = ['RecencyMean','FrequencyMean','MonetaryMean', 'Count']
In [32]:
rfm_level_agg = rfm_level_agg.reset_index()
In [33]:
#Create our plot and resize it.
fig = plt.gcf()
ax = fig.add_subplot()
#fig.set_size_inches(16, 9)
squarify.plot(sizes=rfm_level_agg['Count'], 
              label=rfm_level_agg['RFM_Level'],  color=["red","cyan","blue", "grey", "navy","yellow","green"], alpha=.6 )
plt.title("RFM Segments",fontsize=18,fontweight="bold")
plt.show()
In [34]:
fig = px.scatter_3d(rfm, x='recency', 
                   y='frequency', z='monetary', 
                   color='RFM_Level')
fig.show()
In [35]:
pie = rfm.RFM_Level.value_counts()

plt.figure(figsize=(12, 8))
plt.pie(pie, labels= pie.index, autopct='%1.1f%%')
plt.title('repartion des classes RFM')
plt.show() 

On va concatener les 2 datasets

In [36]:
df_merged = df_olist.merge(rfm, how="left", on="customer_unique_id")
In [37]:
df_merged.boxplot('payment_value', by='RFM_Level',  figsize=(12, 8))
plt.xticks(rotation=90)
plt.title('prix commande par classe RFM')
plt.show()
In [38]:
df_merged.head()
Out[38]:
customer_id customer_unique_id customer_zip_code_prefix customer_city customer_state geolocation_lat geolocation_lng order_id order_status order_purchase_timestamp order_approved_at order_delivered_carrier_date order_delivered_customer_date order_estimated_delivery_date payment_sequential payment_type payment_installments payment_value review_score order_item_id product_id shipping_limit_date price freight_value product_category_name seller_city seller_state recency frequency monetary R F M RFM_Segment_Concat RFM_Score RFM_Level
0 06b8999e2fba1a1fbc88172c00ba8bc7 861eff4711a542e4b93843c6dd7febb0 14409 franca SP -20.498489 -47.396929 00e7ee1b050b8499577073aeb2a297a1 delivered 2017-05-16 15:05:35 2017-05-16 15:22:12 2017-05-23 10:47:57 2017-05-25 10:35:35 2017-06-05 1.0 credit_card 2.0 146.87 4 1.0 a9516a079e37a9c9c36b9b78b10169e8 2017-05-22 15:22:12 124.99 21.88 Home Decor itaquaquecetuba SP 470 1 124.99 1 3 3 133 7 Loyal
1 18955e83d337fd6b2def6b18a428ac77 290c77bc529b7ac935b93aa66c333dc3 9790 sao bernardo do campo SP -23.727992 -46.542848 29150127e6685892b6eab3eec79f59c7 delivered 2018-01-12 20:48:24 2018-01-12 20:58:32 2018-01-15 17:14:59 2018-01-29 12:41:19 2018-02-06 1.0 credit_card 8.0 335.48 5 1.0 4aa6014eceb682077f9dc4bffebc05b0 2018-01-18 20:58:32 289.00 46.48 Home Decor itajai SC 229 1 289.00 2 1 4 214 7 Loyal
2 4e7b3e00288586ebd08712fdd0374a03 060e732b5b29e8181a18229c7b0b2b5e 1151 sao paulo SP -23.531642 -46.656289 b2059ed67ce144a36e2aa97d2c9e9ad2 delivered 2018-05-19 16:07:45 2018-05-20 16:19:10 2018-06-11 14:31:00 2018-06-14 17:58:51 2018-06-13 1.0 credit_card 7.0 157.73 5 1.0 bd07b66896d6f1494f5b86251848ced7 2018-06-05 16:19:10 139.94 17.79 Home Decor itaquaquecetuba SP 102 1 139.94 4 1 3 413 8 Champions
3 b2b6027bc5c5109e529d4dc6358b12c3 259dac757896d24d7702b9acbbff3f3c 8775 mogi das cruzes SP -23.499702 -46.185233 951670f92359f4fe4a63112aa7306eba delivered 2018-03-13 16:06:38 2018-03-13 17:29:19 2018-03-27 23:22:42 2018-03-28 16:04:25 2018-04-10 1.0 credit_card 1.0 173.30 5 1.0 a5647c44af977b148e0a3a4751a09e2e 2018-03-27 16:31:16 149.94 23.36 Home Decor itaquaquecetuba SP 169 1 149.94 3 1 4 314 8 Champions
4 4f2d8ab171c80ec8364f7c12e35b23ad 345ecd01c38d18a9036ed96c73b8d066 13056 campinas SP -22.975100 -47.142925 6b7d50bd145f6fc7f33cebabd7e49d0f delivered 2018-07-29 09:51:30 2018-07-29 10:10:09 2018-07-30 15:16:00 2018-08-09 20:55:48 2018-08-15 1.0 credit_card 8.0 252.25 5 1.0 9391a573abe00141c56e38d84d7d5b3b 2018-07-31 10:10:09 230.00 22.25 Home Decor ibitinga SP 32 1 230.00 4 1 4 414 9 Can't Lose Them
In [39]:
fig = px.scatter(df_merged, x=df_merged.recency, 
                 y=df_merged.frequency, 
                 color=df_merged.customer_state, 
                 size=df_merged.monetary,
                title="Distribution of Frequency and Recency of purchase as per Customer State<br><sup>Size of Circles represent the purchase monetary</sup>")
fig.show()
In [40]:
fig = px.scatter(df_merged, x=df_merged.monetary, 
                 y=df_merged.frequency, 
                 color=df_merged.customer_state, 
                 size=df_merged.recency,
                title="Distribution of Frequency and Moneytary purchase as per Customer State<br><sup>Size of Circles represent the recency of purchase</sup>")
fig.show()

On va enlever les outliers de par rapport au monetary

In [41]:
#On supprime les valeurs aberrantes
def remove_outlier_iqr(df_in, column):
    '''function that gives IQR,lower and upper whisker'''

    #On calcule Q1 et Q3
    q1, q3 = np.quantile(df_in[column], 0.25), np.quantile(df_in[column], 0.95)
    #On calcule l'écart interquartile (IQR)
    IQR = q3 - q1
    #On calcule la borne inférieure à l'aide du Q1 et de l'écart interquartile
    borne_inf = q1-1.5*IQR
    #On calcule la borne supérieure à l'aide du Q3 et de l'écart interquartile
    borne_sup = q3 +1.5*IQR

    #On calcul les valeurs à l'intérieur de la borne inférieure et supérieure
    df1 = df_in[df_in[column] > borne_sup]
    df2 = df_in[df_in[column] < borne_inf]
    
    ''' Removing the Outliers '''
    
    
    df_out = df_in[~((df_in[column] < borne_inf) |(df_in[column] > borne_sup))]

    
    print('Total number of outliers are ', df1.shape[0]+ df2.shape[0])

    return df_out
In [42]:
df_stats = remove_outlier_iqr(df_merged, "monetary")
Total number of outliers are  992
In [43]:
df_stats = remove_outlier_iqr(df_merged, "frequency")
Total number of outliers are  219
In [44]:
df_stats.shape
Out[44]:
(93798, 36)
In [45]:
sns.boxplot(x=df_stats['monetary'])
Out[45]:
<AxesSubplot:xlabel='monetary'>
In [46]:
sns.boxplot(x=df_stats['frequency'])
Out[46]:
<AxesSubplot:xlabel='frequency'>
In [47]:
df_stats.head()
Out[47]:
customer_id customer_unique_id customer_zip_code_prefix customer_city customer_state geolocation_lat geolocation_lng order_id order_status order_purchase_timestamp order_approved_at order_delivered_carrier_date order_delivered_customer_date order_estimated_delivery_date payment_sequential payment_type payment_installments payment_value review_score order_item_id product_id shipping_limit_date price freight_value product_category_name seller_city seller_state recency frequency monetary R F M RFM_Segment_Concat RFM_Score RFM_Level
0 06b8999e2fba1a1fbc88172c00ba8bc7 861eff4711a542e4b93843c6dd7febb0 14409 franca SP -20.498489 -47.396929 00e7ee1b050b8499577073aeb2a297a1 delivered 2017-05-16 15:05:35 2017-05-16 15:22:12 2017-05-23 10:47:57 2017-05-25 10:35:35 2017-06-05 1.0 credit_card 2.0 146.87 4 1.0 a9516a079e37a9c9c36b9b78b10169e8 2017-05-22 15:22:12 124.99 21.88 Home Decor itaquaquecetuba SP 470 1 124.99 1 3 3 133 7 Loyal
1 18955e83d337fd6b2def6b18a428ac77 290c77bc529b7ac935b93aa66c333dc3 9790 sao bernardo do campo SP -23.727992 -46.542848 29150127e6685892b6eab3eec79f59c7 delivered 2018-01-12 20:48:24 2018-01-12 20:58:32 2018-01-15 17:14:59 2018-01-29 12:41:19 2018-02-06 1.0 credit_card 8.0 335.48 5 1.0 4aa6014eceb682077f9dc4bffebc05b0 2018-01-18 20:58:32 289.00 46.48 Home Decor itajai SC 229 1 289.00 2 1 4 214 7 Loyal
2 4e7b3e00288586ebd08712fdd0374a03 060e732b5b29e8181a18229c7b0b2b5e 1151 sao paulo SP -23.531642 -46.656289 b2059ed67ce144a36e2aa97d2c9e9ad2 delivered 2018-05-19 16:07:45 2018-05-20 16:19:10 2018-06-11 14:31:00 2018-06-14 17:58:51 2018-06-13 1.0 credit_card 7.0 157.73 5 1.0 bd07b66896d6f1494f5b86251848ced7 2018-06-05 16:19:10 139.94 17.79 Home Decor itaquaquecetuba SP 102 1 139.94 4 1 3 413 8 Champions
3 b2b6027bc5c5109e529d4dc6358b12c3 259dac757896d24d7702b9acbbff3f3c 8775 mogi das cruzes SP -23.499702 -46.185233 951670f92359f4fe4a63112aa7306eba delivered 2018-03-13 16:06:38 2018-03-13 17:29:19 2018-03-27 23:22:42 2018-03-28 16:04:25 2018-04-10 1.0 credit_card 1.0 173.30 5 1.0 a5647c44af977b148e0a3a4751a09e2e 2018-03-27 16:31:16 149.94 23.36 Home Decor itaquaquecetuba SP 169 1 149.94 3 1 4 314 8 Champions
4 4f2d8ab171c80ec8364f7c12e35b23ad 345ecd01c38d18a9036ed96c73b8d066 13056 campinas SP -22.975100 -47.142925 6b7d50bd145f6fc7f33cebabd7e49d0f delivered 2018-07-29 09:51:30 2018-07-29 10:10:09 2018-07-30 15:16:00 2018-08-09 20:55:48 2018-08-15 1.0 credit_card 8.0 252.25 5 1.0 9391a573abe00141c56e38d84d7d5b3b 2018-07-31 10:10:09 230.00 22.25 Home Decor ibitinga SP 32 1 230.00 4 1 4 414 9 Can't Lose Them
In [48]:
df_stats.to_csv("Dataset/olist_df_stats.csv")
In [49]:
recency = np.log(df_stats['recency'])
frequency = np.log(df_stats["frequency"])
monetary = np.log(df_stats["monetary"])
In [50]:
df = df_stats.copy()
In [51]:
df.loc[:,"recency"] = recency
df.loc[:,"frequency"] = frequency
df.loc[:,"monetary"] = monetary
df.reset_index(inplace=True)
In [52]:
fig, axes = plt.subplots(1, 3, figsize=(15, 6), sharex=True)
fig.suptitle('Distribution des variables après transformation logarithme')

sns.histplot(ax=axes[0], x="recency", data=df, kde=True)
axes[0].set_title("Distribution des jours passés depuis la dernière commande")

sns.histplot(ax=axes[1], x="frequency", data=df, kde=True)
axes[1].set_title("Distribution de la fréquence d'achats des clients")

sns.histplot(ax=axes[2], x="monetary", data=df, kde=True)
axes[2].set_title("Distribution des sommes dépensées par les clients")

plt.tight_layout()
plt.show()
On va ajouter d'autres variables pour la segmentation. On va ajouter le nombre de jours entre la commande et la réception de cette dernière, l'avis du client et la catégorie de produits commandée.
In [53]:
df["nb_days_to_deliver"] = df["order_delivered_customer_date"] - df["order_approved_at"]
df["nb_days_estimated_to_deliver"] = df["order_estimated_delivery_date"] - df["order_approved_at"]

add_cols = ["nb_days_to_deliver","nb_days_estimated_to_deliver"]

for cols in add_cols:
    df[cols] = df[cols].dt.days

df.drop(["order_approved_at",
         "order_delivered_customer_date",
         "order_estimated_delivery_date"], axis=1, inplace=True)
In [54]:
df[df["nb_days_to_deliver"] < 0]
Out[54]:
index customer_id customer_unique_id customer_zip_code_prefix customer_city customer_state geolocation_lat geolocation_lng order_id order_status order_purchase_timestamp order_delivered_carrier_date payment_sequential payment_type payment_installments payment_value review_score order_item_id product_id shipping_limit_date price freight_value product_category_name seller_city seller_state recency frequency monetary R F M RFM_Segment_Concat RFM_Score RFM_Level nb_days_to_deliver nb_days_estimated_to_deliver
1573 1578 caed95700dcb091eeaac59a97fe0cb17 587d22544f4d6f17b39a30723a53a305 8557 poa SP -23.512737 -46.342315 82ffe097d8ddbf319a523b9bbe7725d5 delivered 2018-04-23 01:41:29 2018-04-23 16:52:42 4.0 voucher 1.0 18.49 5 1.0 31cddbc370031fbb839a21441df28015 2018-04-27 02:31:45 84.60 8.89 Beauty guaruja SP 4.859812 0.000000 4.437934 3 2 3 323 8 Champions -1 13
1665 1670 fcb4cecebc159e210cd769d5b767d113 aef59fd9f7310b6ea1600876c510024f 22230 rio de janeiro RJ -22.937298 -43.176890 6fa0c125ee7d870f6602c97e33d87bc5 delivered 2018-08-09 19:32:42 2018-08-14 15:43:00 1.0 debit_card 1.0 45.71 4 1.0 70e0c10acc3dd72e17f2136ddb027317 2018-08-15 16:31:14 31.99 15.32 Miscellaneous sao paulo SP 2.995732 0.000000 3.465423 4 3 1 431 8 Champions -3 2
1794 1799 0e6c54c40385920c74542f153754aca6 4ee553fa4da24e46926241e4a67812b0 6719 cotia SP -23.623451 -46.937038 4e16f71b2a5ddc0266f9ddfdfd0bcfba delivered 2018-07-22 18:58:10 2018-07-24 15:09:00 1.0 boleto 1.0 134.63 5 1.0 595fac2a385ac33a80bd5114aec74eb8 2018-07-26 06:05:30 118.70 15.93 Beauty belo horizonte SP 3.637586 0.000000 4.776599 4 2 3 423 9 Can't Lose Them -1 9
1907 1913 3a412111e4f293539eae05b502fc63de f7c20fc3dc5222bc4389740958175652 8220 sao paulo SP -23.537410 -46.466583 e73fe43cdcd166f7f0c6e3c2bf11a917 delivered 2018-08-09 18:06:43 2018-08-14 14:48:00 1.0 debit_card 1.0 56.50 5 1.0 544c14365db76307e0820a1468ed8114 2018-08-15 16:31:14 49.90 9.10 Home Decor santo andre SP 2.995732 0.000000 3.910021 4 4 2 442 10 Can't Lose Them -6 -5
4425 4436 1d12f3eea210892c245c9be2bd17a93c b1f7aa0f8737c9e24aa42df4a5b1f986 9820 sao bernardo do campo SP -23.740897 -46.555203 6d1832bd42dccbd91d1ce32da7f54492 delivered 2018-07-01 12:02:12 2018-07-03 19:25:00 1.0 boleto 1.0 138.46 5 1.0 10698a66d71ebd12be9e8c39302c74b3 2018-07-09 07:30:58 129.97 8.49 pet_shop mogi guacu SP 4.094345 0.000000 4.867304 4 3 3 433 10 Can't Lose Them -1 10
11820 11847 aa80dff2d54c89d3bc5040a9dbe5108c 5b7724b62e5a4740a84759766904e425 8440 sao paulo SP -23.533543 -46.410418 fa962e76e50f3469ae2abfa54e6d5be0 delivered 2018-07-22 13:07:02 2018-07-24 14:38:00 1.0 boleto 1.0 51.27 5 1.0 6e7068f054aef49b1900247c5c1b752b 2018-07-26 03:45:19 43.70 7.57 Miscellaneous sao paulo SP 3.663562 0.000000 3.777348 4 2 2 422 8 Champions -3 3
12313 12343 bcec3938d91ba9a3bf2c4261a44cb9f3 6cc12c7fe99ee89f0c685359fa81868a 81020 curitiba PR -25.489785 -49.287778 ecd1661658d34312cf9ac2bf34ba3b6b delivered 2018-06-30 18:49:36 2018-07-03 11:46:00 1.0 boleto 1.0 51.44 5 1.0 3bd016185be10ea4b4b2baa6e7dd9e9b 2018-07-12 04:30:47 35.00 16.44 Books sao paulo SP 4.094345 0.000000 3.555348 4 2 1 421 7 Loyal -1 24
12675 12705 0ddf8285e8d523864efdb434122f13fb bd1e9f58ceff377262f6ecb84b6a7656 5754 sao paulo SP -23.619760 -46.754030 be7d02ed4f9ec9a43ac0f6adade17ade delivered 2018-07-03 01:55:39 2018-07-03 19:26:00 1.0 credit_card 2.0 112.42 5 1.0 4fe644d766c7566dbc46fb851363cb3b 2018-07-11 02:31:16 99.99 12.43 Miscellaneous salto SP 4.060443 0.000000 4.605070 4 3 3 433 10 Can't Lose Them -2 14
12846 12876 53bb5604662705879bbe3f11a25bda0f 032573996a6b19166a327f7d4c7eda5b 2222 sao paulo SP -23.482556 -46.575586 5e981569f5835c96e4b288363b3b8f63 delivered 2018-07-30 16:25:37 2018-07-31 13:42:00 1.0 credit_card 3.0 182.81 5 1.0 d812b354e25b2bd51eccbb4c4e8f05f1 2018-08-01 19:31:54 169.89 12.92 Baby sao roque SP 3.401197 0.000000 5.135151 4 1 4 414 9 Can't Lose Them -1 -1
16392 16429 09425ea1839abf2f0d289a0ff453fa21 10c573ba276f9857dc6820e0dde57fb8 14811 araraquara SP -21.781324 -48.143821 0184d4ddb259e1a4cfc2871888cf97b8 delivered 2017-09-01 20:04:28 2017-09-04 14:05:50 1.0 credit_card 8.0 91.16 5 1.0 aa8627a375771ab01288705307ec4ae5 2017-09-07 22:31:42 75.00 16.16 Home Decor ipaussu SP 5.891644 0.000000 4.317488 1 1 2 112 4 Needs Attention -5 6
16813 16851 16b084ebbbdd6de39ed8789dbdb79792 1ecf2cead37e9a5cc687b95e79811e20 13208 jundiai SP -23.194814 -46.896799 d836abb4444d8594455e9766104e958c delivered 2018-07-22 17:23:48 2018-07-24 15:11:00 1.0 boleto 1.0 63.55 5 1.0 8d139b1550c8cc91a3babc9cfe9fc147 2018-07-26 04:05:28 55.00 8.55 Electronics and Tech itapecerica da serra SP 3.637586 0.000000 4.007333 4 1 2 412 7 Loyal -3 3
17952 17994 b74ca180d63f9ae0443e4e13a2f5bdaf 398957fce766920db4c4b206d7c9aa85 9271 santo andre SP -23.641861 -46.500728 f222c56f035b47dfa1e069a88235d730 delivered 2018-01-30 09:43:45 2018-01-31 19:48:44 1.0 boleto 1.0 98.62 5 1.0 16ed6a6e3fce23b741650437fe58d65b 2018-02-06 02:35:36 89.18 9.44 Home Decor sao paulo SP 5.356586 0.000000 4.490657 3 1 3 313 7 Loyal -4 10
18955 18998 dfa8547fa65cf8971c91c4102eba6180 5a821a6e169359c29d05cdf715f4c3d4 3920 sao paulo SP -23.597730 -46.526524 5b9e437b9b8c217c1e35158aeafc2102 delivered 2018-04-20 00:23:34 2018-04-23 15:06:33 1.0 debit_card 1.0 33.16 5 1.0 52e5fdcb5e51164483d584c75bd3a478 2018-04-26 17:31:08 25.77 7.39 Electronics and Tech sao paulo SP 4.882802 0.000000 3.249211 3 2 1 321 6 Potential -1 12
19236 19279 f831c1fa80308975ec2b58e4877328e0 d121ceaccf4c241eae9a5f97cc661ccf 13610 leme SP -22.189204 -47.388202 1fab4ac9d85079b3da72a11475ae1685 delivered 2017-09-01 19:04:22 2017-09-04 13:10:23 1.0 credit_card 8.0 243.01 5 1.0 7a10781637204d8d10485c71a6108a2e 2017-09-07 22:31:41 229.90 13.11 Fashion guariba SP 5.891644 0.000000 5.437644 1 4 4 144 9 Can't Lose Them -6 6
19287 19330 3cb502fcf455deaa130fe625b29c7804 b6e2e472b325c10e40784bc2ec3e2c5d 4719 sao paulo SP -23.632941 -46.706958 5f827831438fdab57e6b6b98328b360e delivered 2018-04-21 19:28:09 2018-04-23 22:38:25 1.0 credit_card 3.0 136.77 5 1.0 3096d513ef378c2f6b32b7f4f42a471a 2018-04-26 20:30:46 129.00 7.77 Fashion sao paulo SP 4.867534 0.000000 4.859812 3 3 3 333 9 Can't Lose Them -1 15
21838 21888 6f40052a8d45f8396c7446d4cea1826f 00950dbc783342e164c24e09023ebcb7 7081 guarulhos SP -23.443467 -46.553221 b038f6ae89bdd1fddd3b51ace3ff8b30 delivered 2018-07-03 15:48:48 2018-07-04 11:19:00 1.0 credit_card 1.0 89.10 5 1.0 990d135e28e075648cb7d83198fdccf4 2018-07-09 16:31:02 80.38 8.72 Baby sao paulo SP 4.043051 0.000000 4.386765 4 1 3 413 8 Champions -1 10
25619 25679 c78088a2467e83cbb726c1283608dca7 0e09770453aa9d84c1c5db77a4bf3650 9371 maua SP -23.669873 -46.476213 a49dc0169f9bb6d3ac9829bdebd12299 delivered 2018-07-21 11:46:28 2018-07-24 14:55:00 1.0 boleto 1.0 78.64 5 1.0 d04857e7b4b708ee8b8b9921163edba3 2018-07-26 04:05:38 69.99 8.65 Electronics and Tech sao paulo SP 3.688879 0.000000 4.248352 4 1 2 412 7 Loyal -2 4
28753 28820 345a1ac3ca53330b45af176810ac91a6 e7c3d1c547decbf4755ea2bac75b314d 4006 sao paulo SP -23.577032 -46.647022 3a41d0e227b4c6fff055561af5eaca13 delivered 2018-07-03 15:00:40 2018-07-04 15:11:00 1.0 credit_card 2.0 180.33 5 1.0 80096496b2a02d4fff129b5e6a8f047e 2018-07-09 15:31:39 161.99 18.34 sports_leisure sao paulo SP 4.043051 0.000000 5.087535 4 4 4 444 12 Can't Lose Them -1 10
30008 30075 5b70139504198431b1a8465887ec205b 04861081968535f2e788c4ff33e742a2 13480 limeira SP -22.570721 -47.403095 d15a4c4d1426b7275d538063d8fe4c6a delivered 2018-04-20 15:45:31 2018-04-23 16:48:48 1.0 credit_card 4.0 308.28 5 1.0 a68149d568e07ca72477faf0b10906e1 2018-04-26 16:31:25 299.00 9.28 Baby indaiatuba SP 4.875197 0.000000 5.700444 3 1 4 314 8 Champions -1 5
32386 32455 814d433efda6ac4b0859b5a71aac64c8 a4312240c4796f6045b2cc574b5afb2b 13285 vinhedo SP -23.035543 -47.005773 fcbf4f4ef049367f9f85af94ed3b6010 delivered 2018-04-20 15:11:15 2018-04-23 21:58:45 1.0 credit_card 3.0 38.29 4 1.0 cd322544c79e58e64b103bdceccc507e 2018-04-26 15:31:04 30.90 7.39 sports_leisure sao paulo SP 4.875197 0.000000 3.430756 3 3 1 331 7 Loyal -1 5
33785 33857 a4bd2fc7bf4fc0061011ef71a2208d63 dd6df6bc626f435236526c7f50b43d26 12940 atibaia SP -23.114731 -46.552881 9c3186381b733d4304e2e416afc6bbc1 delivered 2018-07-28 20:49:05 2018-07-31 14:09:00 1.0 credit_card 1.0 210.08 5 1.0 8d7824fad74013f01b11d646e9b94729 2018-08-03 16:10:19 199.99 10.09 sports_leisure amparo SP 3.465736 0.000000 5.298267 4 4 4 444 12 Can't Lose Them -2 0
35667 35741 42f75f85ea5abce880a33720e1334d02 e4019dc45b2d51954e9cca5b62c83bb2 11075 santos SP -23.954600 -46.337731 f5234dc943e266dc8922b4870dc9d491 delivered 2018-07-02 09:29:17 2018-07-03 16:52:00 1.0 boleto 1.0 97.79 5 1.0 3c2417f0f42bc3de63d08893b06571e9 2018-07-09 07:31:20 89.90 7.89 toys sao paulo SP 4.077537 0.000000 4.498698 4 4 3 443 11 Can't Lose Them -2 10
36735 36811 a0a77214faef45bda6ff573f33f39f2c d7c6335139cb055ed0ce1125ddc788d8 4270 sao paulo SP -23.591459 -46.613946 f6f0b2497c5a4ca89670186757ab2684 delivered 2017-06-26 12:23:32 2017-06-27 17:03:09 1.0 credit_card 1.0 79.20 5 1.0 44dfeb8491daf22683dff46c7a3265b9 2017-06-30 13:32:11 65.00 14.20 Home Decor curitiba PR 5.616771 0.693147 5.302857 2 4 4 244 10 Can't Lose Them -2 12
42452 42545 7aa78ec38a25e08bf82617d92a583d0f 995e3863a393f3465e260e1ad265e7eb 2415 sao paulo SP -23.471948 -46.631370 8b83251dbc9bd02530320cceb82cc03e delivered 2018-04-20 12:33:21 2018-04-23 18:12:48 1.0 boleto 1.0 44.38 4 1.0 1de85e3b5a43099a57d10a7d3dedf97e 2018-04-26 03:30:43 36.99 7.39 Construction sao paulo SP 4.882802 0.000000 3.610648 3 3 1 331 7 Loyal -1 7
42529 42622 1755fad7863475346bc6c3773fe055d3 a5314ac290a8b141491e987ae37aa7cc 13454 santa barbara d'oeste SP -22.740602 -47.375821 58d4c4747ee059eeeb865b349b41f53a delivered 2018-07-21 12:49:32 2018-07-24 12:57:00 1.0 boleto 1.0 35.36 5 1.0 1613b819ab5dae53aead2dbb4ebdb378 2018-07-26 03:45:26 27.90 7.46 Miscellaneous atibaia SP 3.688879 0.000000 3.328627 4 3 1 431 8 Champions -1 4
43714 43807 2e1a3155fdbee565b7ad55cb75ce8f78 ae7f543e349dd405a2ff5d6ea850cb8d 6414 barueri SP -23.493552 -46.871704 6df6c9c9af6ef75b4f06f8a7b9f47e9c delivered 2018-07-23 08:50:48 2018-07-24 14:47:00 1.0 boleto 1.0 28.10 5 1.0 e8a1dffbef0392ef084cca41655a13d1 2018-07-26 04:25:19 19.80 8.30 Construction ribeirao pires SP 3.637586 0.000000 2.985682 4 3 1 431 8 Champions -3 2
46609 46710 63162a081407c40769b8b941dbbc7275 52a7ee2fff9bca157d19972ad5825ed2 6702 cotia SP -23.581605 -46.929404 0e1501cd30758f584d1fa67cc6f9493a delivered 2018-07-02 16:16:03 2018-07-03 14:36:00 1.0 boleto 1.0 136.79 5 1.0 5a57a59c44429be19e1ce8e69e15c473 2018-07-06 04:32:02 120.00 16.79 Home Decor limeira SP 4.060443 0.000000 4.787492 4 2 3 423 9 Can't Lose Them -1 11
47843 47949 90b8af517fbab96fb08d0115dffdc570 f900fc2f1cd425f450ed45de58815199 3433 sao paulo SP -23.560139 -46.534186 0467205a89711e4ec8e70ef2277e3287 delivered 2018-07-03 10:30:52 2018-07-03 13:12:00 1.0 credit_card 1.0 118.83 5 1.0 f5badee8a0e679f057ec0ca4f0f32dcb 2018-07-05 11:31:12 109.90 8.93 Construction sao paulo SP 4.060443 0.000000 4.699571 4 4 3 443 11 Can't Lose Them -1 10
48332 48440 2e3f19e3763da38dd82c6e2e9ff69c98 a22964f39cb8758996ec42f5fb5a7028 2422 sao paulo SP -23.471749 -46.636773 a727355acb88d9b3e6e41fb2e3888a0e delivered 2018-07-23 12:58:28 2018-07-24 15:07:00 1.0 boleto 1.0 27.30 4 1.0 ab1f9387c0627dd24000bfbc54fdeee0 2018-07-26 06:10:16 19.90 7.40 Electronics and Tech sao paulo SP 3.637586 0.000000 2.990720 4 3 1 431 8 Champions -4 2
50416 50529 babb58dd58d643c769e12f18288b3214 4f29941f07cdc599d638e5c323ecf4cd 5181 sao paulo SP -23.455861 -46.744092 5a41aefdf8010bbd69a5264f69213b73 delivered 2018-07-02 16:13:00 2018-07-03 12:40:00 1.0 boleto 1.0 40.39 5 1.0 3af6d5f9fdb78f106c003ce49d7f0186 2018-07-05 04:30:38 32.90 7.49 Beauty sao paulo SP 4.060443 0.000000 3.493473 4 2 1 421 7 Loyal -1 10
52265 52378 9f64391c68c234a7264087c2d0c1ac76 2d32acd3ec65cdd1da5ef425e03d0fda 38406 uberlandia MG -18.875821 -48.231897 40de47dfa620d667117e4a6067b6e1ec delivered 2017-09-01 20:05:55 2017-09-04 20:36:58 1.0 credit_card 3.0 160.50 5 3.0 d678178aa4291cd25a755a90188375c8 2017-09-10 22:33:43 38.40 15.10 Home Decor sao paulo SP 5.891644 0.000000 3.648057 1 1 1 111 3 Require Activation -6 8
52358 52472 67a5fd09c67dd51115ca8044b4afdc34 1dd809dce0726b91fe230b6cbb0bcf42 13770 caconde SP -21.528684 -46.645051 c3b8c17ee15e0e798c2e178b7d4c7f42 delivered 2017-09-01 20:04:47 2017-09-05 15:43:31 1.0 credit_card 1.0 61.59 3 1.0 1d6d7be70107889e3193819aca4dc1dc 2017-09-07 22:31:45 48.90 12.69 Home Decor ibitinga SP 5.891644 0.000000 3.889777 1 1 2 112 4 Needs Attention -3 6
57476 57597 b3ecc2bd81952e83915d0df692cca31f bbed51cbb106c22420a76daf801daaa2 1529 sao paulo SP -23.568439 -46.634555 641879e47a518bfcd257cdedda69f029 delivered 2018-04-21 22:03:59 2018-04-23 19:04:12 1.0 credit_card 2.0 23.77 5 1.0 a027d4d9a0ffc6b3c2cf45636b044078 2018-04-25 22:31:37 15.90 7.87 Home Decor salto SP 4.867534 0.000000 2.766319 3 3 1 331 7 Loyal -1 15
58586 58711 2dda54e25d0984e12705c84d4030e6e0 71692b73230707ce0f95d8a6091c22b4 27946 macae RJ -22.366786 -41.793413 6e57e23ecac1ae881286657694444267 delivered 2018-08-09 17:36:47 2018-08-14 13:26:00 1.0 debit_card 1.0 333.91 3 1.0 bb50f2e236e5eea0100680137654686c 2018-08-15 16:30:58 330.00 20.41 Beauty sao bernardo do campo SP 2.995732 0.000000 5.799093 4 2 4 424 10 Can't Lose Them -3 16
60061 60191 5871240b438e21144634d9fdd480e0d8 4c2b4ad5642c2949886e9c9484a03958 7135 guarulhos SP -23.425556 -46.525292 70e534acd4ff3f46956f7b4b36418474 delivered 2018-07-03 16:51:33 2018-07-04 12:39:00 1.0 credit_card 1.0 79.14 5 1.0 3c5559ffe829dd89665484c98f0c4e18 2018-07-09 17:31:15 69.90 9.24 Electronics and Tech guarulhos SP 4.043051 0.000000 4.247066 4 2 2 422 8 Champions -1 10
60656 60787 30aef21b6dd78da1334dd1164cc0b05e 25d27a4142682b5d504dcc67caab8d6f 4909 sao paulo SP -23.678940 -46.736281 bc4854efd86d9f42140c951c595d20c1 delivered 2017-09-01 20:05:42 2017-09-04 20:49:57 1.0 credit_card 2.0 142.92 5 1.0 657aa59c6c58816ddc4f86c51a2759c9 2017-09-07 22:31:40 129.00 13.92 Baby serra negra SP 5.891644 0.000000 4.859812 1 1 3 113 5 Promising -7 6
61119 61252 f73735edd848cf439de47e82b57abce2 a9b82c1107a9c2e53e36281aa2b6f0c4 13015 campinas SP -22.904214 -47.056549 97823d6731e59cd6b4f9d3ea113a976e delivered 2018-07-23 11:26:36 2018-07-24 07:14:00 1.0 credit_card 3.0 30.69 5 1.0 cce679660c66e6fbd5c8091dfd29e9cd 2018-07-30 13:44:43 17.90 12.79 Home Decor ibitinga SP 3.637586 0.000000 2.884801 4 3 1 431 8 Champions -2 11
64626 64772 d62eee7d44db15b7b56c01f70fbb2e4c d76054856da7df9a8b71a9f4ac9ddcbe 11705 praia grande SP -24.036617 -46.504863 59c51bdf0c3270d962625e5ebbe22fb1 delivered 2018-06-29 05:26:29 2018-07-03 19:29:00 1.0 boleto 1.0 28.20 5 1.0 5099f7000472b634fea8304448d20825 2018-07-09 04:31:19 19.90 8.30 Beauty sao paulo SP 4.127134 0.000000 2.990720 4 4 1 441 9 Can't Lose Them -2 7
64738 64884 46777be868cf25df4587888868aaba43 7e492b1caa3a7f4b2cde0d3f2316ae2b 4123 sao paulo SP -23.610422 -46.626594 9a6327d5a077984fa3e3c659a24e7915 delivered 2018-04-20 15:30:13 2018-04-23 17:56:55 1.0 debit_card 1.0 29.74 1 1.0 47376be1404bbe927766e8a9011714cb 2018-04-26 16:31:44 22.35 7.39 Miscellaneous guarulhos SP 4.875197 0.000000 3.106826 3 2 1 321 6 Potential -1 13
66260 66412 d5485a1a47978e2963a3479a5bef5155 1af490ec91fd6b72639388c4208129dc 9060 santo andre SP -23.665536 -46.549115 746e0019b6fc6170534f81e45e8d55ec delivered 2018-04-22 22:48:12 2018-04-24 01:06:34 1.0 credit_card 2.0 74.78 4 1.0 8e761c0d91fd28b0937e63238852f2f8 2018-04-26 23:30:51 65.90 8.88 Miscellaneous guarulhos SP 4.859812 0.000000 4.188138 3 1 2 312 6 Potential -1 15
66340 66492 39f7d57021887ffed8775b92df685caf eb25b57ed8605aecea925d5680d2a05f 1529 sao paulo SP -23.568439 -46.634555 dd46ac8cf8a385a883f38119228c584e delivered 2018-07-21 15:01:31 2018-07-24 15:31:00 1.0 boleto 1.0 28.20 5 1.0 04262a7ed71aa34edfde0f8429598505 2018-07-27 04:05:26 19.90 8.30 Electronics and Tech sao paulo SP 3.663562 0.000000 2.990720 4 4 1 441 9 Can't Lose Them -1 5
67563 67722 4ac6bbe5dcaf264863bff08692d00578 b88f3063be88d1ecc23e44d000d5f639 35703 sete lagoas MG -19.436003 -44.205254 70f357cca87c1162357bf3c0a993cbe5 delivered 2017-09-01 18:40:11 2017-09-05 12:33:19 1.0 credit_card 7.0 75.65 5 1.0 7c1bd920dbdf22470b68bde975dd3ccf 2017-09-10 22:34:00 58.99 16.66 Beauty santo andre SP 5.891644 0.000000 4.077368 1 3 2 132 6 Potential -2 8
68113 68273 e17005589b4ea116d5d37e99073c8f91 f6b25feec9492b2e3f4ddfec018c4c07 13348 indaiatuba SP -23.116399 -47.237827 ced9d48cd170ccf0a762bf379b4723fc delivered 2018-04-20 15:52:59 2018-04-23 23:12:34 1.0 credit_card 2.0 76.39 5 1.0 44d097d59e8430f88a67517cd0c4f865 2018-04-26 16:30:58 69.00 7.39 Fashion sao paulo SP 4.875197 0.000000 4.234107 3 4 2 342 9 Can't Lose Them -1 15
68765 68926 e14b08a5e109d77846ce7d4cb8b12d6c 799f96a722913e43fa42d9b0b49ae861 7110 guarulhos SP -23.467220 -46.526394 547b3596c03dfad1b87305e9c5a554c2 delivered 2018-04-22 23:05:38 2018-04-23 19:04:05 1.0 credit_card 3.0 124.93 1 3.0 457cd0a99cfa977993e4eb6d98d95abb 2018-04-26 23:30:52 39.99 0.01 Electronics and Tech sao paulo SP 4.859812 0.000000 3.688629 3 2 1 321 6 Potential -1 15
68932 69094 0e030dda2e8d63beb191f52346ac0fbe 27a12bf46bf13e96f6dc1d15ff2fce12 6429 barueri SP -23.489688 -46.952915 d6e3732aea036f7d71b44db75163e2f4 delivered 2018-07-03 09:45:53 2018-07-04 14:03:00 1.0 credit_card 3.0 182.81 5 1.0 5418df20cd587e12212d54c1d5d990fb 2018-07-09 10:31:06 169.89 12.92 Baby sao roque SP 4.060443 0.000000 5.135151 4 1 4 414 9 Can't Lose Them -1 14
71142 71313 c6e541df5fc7e101c333f2294d63b052 4a4bda06e9a59aef06227b39ff05b36a 4910 sao paulo SP -23.684506 -46.738067 6d1f16f4ed292206fbeedf3b44e0ceb3 delivered 2018-07-01 14:15:20 2018-07-03 19:26:00 1.0 boleto 1.0 37.69 5 1.0 e0cf79767c5b016251fe139915c59a26 2018-07-05 04:31:42 29.90 7.79 Beauty piracicaba SP 4.094345 0.000000 3.397858 4 2 1 421 7 Loyal -1 10
71517 71688 dbbbdb6ab9a7d8e3fc39a907f456bf19 c1dffa0ed8695e4823f90ae4550e336e 5412 sao paulo SP -23.556761 -46.680415 4387477eec4b3c89b39f3f454940d059 delivered 2018-08-09 20:45:10 2018-08-14 12:35:00 1.0 debit_card 1.0 222.02 4 2.0 818b2f7d6fe6abc4575e94c7dcccfcc4 2018-08-15 16:30:53 99.00 16.96 Home Decor sao paulo SP 2.995732 0.000000 4.595120 4 3 3 433 10 Can't Lose Them -6 -7
74408 74583 68b476a4ea762d1de4bc7eea1942cc93 e887a80ecfe48b186f02c0cb0cc1168c 5339 sao paulo SP -23.554474 -46.755488 cb0909932c118481a0e44396d80ab2e8 delivered 2018-04-22 21:15:32 2018-04-23 23:21:10 1.0 credit_card 5.0 68.87 5 1.0 fa7be99321f2dbb1e518d2139ccb1d39 2018-04-26 21:31:01 59.99 8.88 sports_leisure santa barbara d'oeste SP 4.859812 0.000000 4.094178 3 4 2 342 9 Can't Lose Them -1 15
78151 78333 88eddac47472e08eb17d96a25969a655 2a0ac21838f4a0a00c767bbed07de79a 7196 guarulhos SP -23.454106 -46.511503 112b993827414a482ef6534e251fad3a delivered 2018-01-29 17:26:51 2018-02-01 18:48:38 1.0 boleto 1.0 117.86 5 2.0 57c580a8fbf3d8374b06bee7b6f4ce31 2018-02-06 02:35:41 49.90 9.03 Home Decor santa barbara d'oeste SP 5.356586 0.000000 3.910021 3 1 2 312 6 Potential -2 10
79026 79213 be274b149cc9bb8af13628e268867c10 3e3d56f9737c010ccb5b67c601a18b53 7130 guarulhos SP -23.442493 -46.520640 66e1b657a71397245290f39ffe24031e delivered 2018-07-22 15:09:37 2018-07-24 19:07:00 1.0 boleto 1.0 52.58 5 2.0 c985e917daf44dfe2779833209017098 2018-07-27 04:05:26 18.90 7.39 Home Decor limeira SP 3.401197 0.693147 5.117335 4 4 4 444 12 Can't Lose Them -3 4
79481 79668 ba0660bf3fffe505ee892e153a2fbd49 cd2624b9b75d83efb640f7204232511e 4753 sao paulo SP -23.654350 -46.713789 4df92d82d79c3b52c7138679fa9b07fc delivered 2018-07-24 11:32:11 2018-07-26 14:46:00 2.0 voucher 1.0 100.00 5 1.0 de533fe5b9448ea9792195806938dbf5 2018-07-31 12:55:27 196.80 12.40 Home Decor sao paulo SP 3.610918 0.000000 5.282188 4 4 4 444 12 Can't Lose Them -3 7
81328 81518 10d109b1f1f45bd2bf9afd83543eedd4 ac4316047f392ac02340c4b6160d95a8 2071 sao paulo SP -23.504279 -46.606428 394d17c2b71a726e205caaeee3d2aa3d delivered 2018-08-03 14:56:26 2018-08-06 11:51:00 1.0 credit_card 1.0 57.50 5 1.0 dc52f0f5d3ec37a93eaf956cde4e5d2c 2018-08-07 15:10:09 49.00 8.50 Fashion sao paulo SP 3.295837 0.000000 3.891820 4 3 2 432 9 Can't Lose Them -2 -1
83626 83819 8d10ccfe7c979ef982e4964bbff62637 8d43f7b8f33f60fb6ae5ec21072135b5 82970 curitiba PR -25.473759 -49.203485 6dcf0aeb8b1eb4021c26e1d0e9394979 delivered 2018-08-09 20:37:34 2018-08-14 13:05:00 1.0 debit_card 1.0 318.97 1 1.0 f61f6e53f9c6d150e5e48919a25272d1 2018-08-15 16:31:14 299.00 34.92 Home Decor curitiba PR 2.995732 0.000000 5.700444 4 3 4 434 11 Can't Lose Them -5 -6
85031 85230 2fedecfd993b8b3fa889d00eee230748 4687b78e990956109c4019200e3ac062 19906 ourinhos SP -22.984706 -49.881835 6b80bb20190715d71c43efff617bd659 delivered 2017-02-19 01:15:03 2017-02-22 16:05:29 1.0 boleto 1.0 128.10 5 1.0 fa0125a234fa85a755371fa0ea37361a 2017-02-23 01:15:03 115.90 12.20 Home Decor louveira SP 6.322565 0.000000 4.752728 1 2 3 123 6 Potential -5 15
88206 88414 2b7fff075bda701552485ef3f0810257 0a230b3fdaae7f47c5a5c39a4b73a340 2992 sao paulo SP -23.448821 -46.728764 cf72398d0690f841271b695bbfda82d2 delivered 2017-09-01 18:45:33 2017-09-04 20:12:41 1.0 credit_card 8.0 284.49 5 1.0 89bbac967e47033f5d8c15527de1ef9f 2017-09-07 22:34:28 269.00 15.49 Home Decor osasco SP 5.891644 0.000000 5.594711 1 1 4 114 6 Potential -3 7
88907 89116 87bd31960d1a15744a96ea9fc9a95b8c 3c16b3dab6262b2f1ff7f809a9afa8ec 5805 sao paulo SP -23.653323 -46.734511 fab28f3d1cfbe171566884703d82682e delivered 2018-04-21 09:18:13 2018-04-23 23:35:37 1.0 credit_card 1.0 20.75 5 1.0 d2f5484cbffe4ca766301b21ab9246dd 2018-04-26 09:30:40 12.88 7.87 Electronics and Tech santos SP 4.875197 0.000000 2.555676 3 1 1 311 5 Promising -1 9
90308 90518 788e845925ff64c9df5d8ba40e28cf50 fbd4d7e33bef85ee6e10995ad94ef88a 62230 ipueiras CE -4.543940 -40.713632 8554cb37f7158cb0b082a841d24a4589 delivered 2017-09-01 18:40:44 2017-09-04 19:12:19 1.0 credit_card 10.0 1299.28 1 2.0 a042f8bfc21bceff9db6313d2f98aefc 2017-09-07 22:32:18 799.99 30.64 Fashion barueri SP 5.891644 0.000000 6.684599 1 4 4 144 9 Can't Lose Them -6 18
91749 91965 e49cb31672822de07bf6f6b0899ee58f d4e50256ef063395ef57726e6b289473 4044 sao paulo SP -23.600609 -46.639784 1809078b655469f11b429102fb296998 delivered 2018-04-20 20:22:34 2018-04-23 18:18:27 1.0 boleto 1.0 15.77 5 1.0 f60e19fc3b0ee38735e6a6fefe937bdf 2018-04-26 02:31:45 7.90 7.87 Electronics and Tech sorocaba SP 4.875197 0.000000 2.066863 3 4 1 341 8 Champions -1 15
93571 93790 e24fceba7ac4b1b1cd1884f05ba68e9d 7410c7ed56c93d4eee17a7b7b3a75b3e 13056 campinas SP -22.975100 -47.142925 4f3a6e28d764cf896b1fceb0028422c8 delivered 2018-07-03 09:34:16 2018-07-03 16:52:00 1.0 credit_card 2.0 122.05 5 1.0 5563f6b699b362e1bd6a2e1e650f52bd 2018-07-09 10:30:46 108.99 13.06 Books sao paulo SP 4.060443 0.000000 4.691256 4 2 3 423 9 Can't Lose Them -1 14
In [55]:
df = df.loc[df["nb_days_to_deliver"] > 0]
In [56]:
df[df["nb_days_estimated_to_deliver"] < 0]
Out[56]:
index customer_id customer_unique_id customer_zip_code_prefix customer_city customer_state geolocation_lat geolocation_lng order_id order_status order_purchase_timestamp order_delivered_carrier_date payment_sequential payment_type payment_installments payment_value review_score order_item_id product_id shipping_limit_date price freight_value product_category_name seller_city seller_state recency frequency monetary R F M RFM_Segment_Concat RFM_Score RFM_Level nb_days_to_deliver nb_days_estimated_to_deliver
8814 8836 9597de20c8b55d0f22dcc26fdaa6b2f1 297f25d165987a0979a138aa7529d19d 6764 taboao da serra SP -23.61583 -46.768533 9675440ebf61a1a3482cc6308e3ebd28 delivered 2018-08-18 23:35:23 2018-08-27 16:14:00 1.0 credit_card 8.0 82.79 2 1.0 6f0f907c9cf458139bac7f01777b9bbf 2018-08-27 22:05:08 75.0 7.79 Fashion sao paulo SP 2.397895 0.0 4.317488 4 1 2 412 7 Loyal 3 -2
In [57]:
df = df.loc[df["nb_days_estimated_to_deliver"] > 0]
In [58]:
df.head()
Out[58]:
index customer_id customer_unique_id customer_zip_code_prefix customer_city customer_state geolocation_lat geolocation_lng order_id order_status order_purchase_timestamp order_delivered_carrier_date payment_sequential payment_type payment_installments payment_value review_score order_item_id product_id shipping_limit_date price freight_value product_category_name seller_city seller_state recency frequency monetary R F M RFM_Segment_Concat RFM_Score RFM_Level nb_days_to_deliver nb_days_estimated_to_deliver
0 0 06b8999e2fba1a1fbc88172c00ba8bc7 861eff4711a542e4b93843c6dd7febb0 14409 franca SP -20.498489 -47.396929 00e7ee1b050b8499577073aeb2a297a1 delivered 2017-05-16 15:05:35 2017-05-23 10:47:57 1.0 credit_card 2.0 146.87 4 1.0 a9516a079e37a9c9c36b9b78b10169e8 2017-05-22 15:22:12 124.99 21.88 Home Decor itaquaquecetuba SP 6.152733 0.0 4.828234 1 3 3 133 7 Loyal 8 19
1 1 18955e83d337fd6b2def6b18a428ac77 290c77bc529b7ac935b93aa66c333dc3 9790 sao bernardo do campo SP -23.727992 -46.542848 29150127e6685892b6eab3eec79f59c7 delivered 2018-01-12 20:48:24 2018-01-15 17:14:59 1.0 credit_card 8.0 335.48 5 1.0 4aa6014eceb682077f9dc4bffebc05b0 2018-01-18 20:58:32 289.00 46.48 Home Decor itajai SC 5.433722 0.0 5.666427 2 1 4 214 7 Loyal 16 24
2 2 4e7b3e00288586ebd08712fdd0374a03 060e732b5b29e8181a18229c7b0b2b5e 1151 sao paulo SP -23.531642 -46.656289 b2059ed67ce144a36e2aa97d2c9e9ad2 delivered 2018-05-19 16:07:45 2018-06-11 14:31:00 1.0 credit_card 7.0 157.73 5 1.0 bd07b66896d6f1494f5b86251848ced7 2018-06-05 16:19:10 139.94 17.79 Home Decor itaquaquecetuba SP 4.624973 0.0 4.941214 4 1 3 413 8 Champions 25 23
3 3 b2b6027bc5c5109e529d4dc6358b12c3 259dac757896d24d7702b9acbbff3f3c 8775 mogi das cruzes SP -23.499702 -46.185233 951670f92359f4fe4a63112aa7306eba delivered 2018-03-13 16:06:38 2018-03-27 23:22:42 1.0 credit_card 1.0 173.30 5 1.0 a5647c44af977b148e0a3a4751a09e2e 2018-03-27 16:31:16 149.94 23.36 Home Decor itaquaquecetuba SP 5.129899 0.0 5.010235 3 1 4 314 8 Champions 14 27
4 4 4f2d8ab171c80ec8364f7c12e35b23ad 345ecd01c38d18a9036ed96c73b8d066 13056 campinas SP -22.975100 -47.142925 6b7d50bd145f6fc7f33cebabd7e49d0f delivered 2018-07-29 09:51:30 2018-07-30 15:16:00 1.0 credit_card 8.0 252.25 5 1.0 9391a573abe00141c56e38d84d7d5b3b 2018-07-31 10:10:09 230.00 22.25 Home Decor ibitinga SP 3.465736 0.0 5.438079 4 1 4 414 9 Can't Lose Them 11 16
In [59]:
df.to_csv("Dataset/data_stability.csv")
In [60]:
df.boxplot('nb_days_to_deliver', by='RFM_Level',  figsize=(12, 8))
plt.xticks(rotation=90)
plt.title('Nombre de jour de livraison commande par classe RFM')
plt.show()
In [61]:
df_segment = df[["customer_unique_id","recency","frequency","monetary"]]
In [62]:
df_segment.set_index("customer_unique_id", inplace=True)
In [63]:
df_segment.head()
Out[63]:
recency frequency monetary
customer_unique_id
861eff4711a542e4b93843c6dd7febb0 6.152733 0.0 4.828234
290c77bc529b7ac935b93aa66c333dc3 5.433722 0.0 5.666427
060e732b5b29e8181a18229c7b0b2b5e 4.624973 0.0 4.941214
259dac757896d24d7702b9acbbff3f3c 5.129899 0.0 5.010235
345ecd01c38d18a9036ed96c73b8d066 3.465736 0.0 5.438079

Test avec uniquement les variables RFM

Standardisation des données

In [64]:
scaler = StandardScaler()
In [65]:
scaler.fit(df_segment)
rfm_normalized = scaler.transform(df_segment)

Réduction de dimensions

La réduction de dimension a pour but de faciliter la visualisation de nos données en les ramenant dans un espace en 2 dimensions.

In [66]:
# PCA Pipeline
pca = PCA(svd_solver='full')
pca.fit(rfm_normalized)
data_pca = pca.transform(rfm_normalized)
In [67]:
# Explained variance
varexpl = pca.explained_variance_ratio_*100

# Plot of cumulated variance
plt.figure(figsize=(12,8))
plt.bar(np.arange(len(varexpl))+1, varexpl)

cumSumVar = varexpl.cumsum()
plt.plot(np.arange(len(varexpl))+1, cumSumVar,c="red",marker='o')

valid_idx = np.where(cumSumVar >= 95)[0]
min_plans = valid_idx[cumSumVar[valid_idx].argmin()]+1


plt.xlabel("rang de l'axe d'inertie")
plt.xticks(np.arange(len(varexpl))+1)
plt.ylabel("pourcentage d'inertie")
plt.title("{}% de la variance totale est expliquée"\
          " par les {} premiers axes".format(95,
                                            min_plans))
plt.show(block=False)

On conserve les 3 axes principaux pour expliquer la variance à 95%.

In [68]:
def display_circles(pcs, n_comp, pca, axis_ranks, labels=None, label_rotation=0, lims=None):
    for d1, d2 in axis_ranks: # On affiche les 3 premiers plans factoriels, donc les 6 premières composantes
        if d2 < n_comp:

            # initialisation de la figure
            fig, ax = plt.subplots(figsize=(10,10))

            # détermination des limites du graphique
            if lims is not None :
                xmin, xmax, ymin, ymax = lims
            elif pcs.shape[1] < 30 :
                xmin, xmax, ymin, ymax = -1, 1, -1, 1
            else :
                xmin, xmax, ymin, ymax = min(pcs[d1,:]), max(pcs[d1,:]), min(pcs[d2,:]), max(pcs[d2,:])

            # affichage des flèches
            # s'il y a plus de 30 flèches, on n'affiche pas le triangle à leur extrémité
            if pcs.shape[1] < 30 :
                plt.quiver(np.zeros(pcs.shape[1]), np.zeros(pcs.shape[1]),
                   pcs[d1,:], pcs[d2,:], 
                   angles='xy', scale_units='xy', scale=1, color="grey")
                # (voir la doc : https://matplotlib.org/api/_as_gen/matplotlib.pyplot.quiver.html)
            else:
                lines = [[[0,0],[x,y]] for x,y in pcs[[d1,d2]].T]
                ax.add_collection(LineCollection(lines, axes=ax, alpha=.1, color='black'))
            
            # affichage des noms des variables  
            if labels is not None:  
                for i,(x, y) in enumerate(pcs[[d1,d2]].T):
                    if x >= xmin and x <= xmax and y >= ymin and y <= ymax :
                        plt.text(x, y, labels[i], fontsize='14', ha='center', va='center', rotation=label_rotation, color="blue", alpha=0.5)
            
            # affichage du cercle
            an = np.linspace(0, 2 * np.pi, 100)  # Add a unit circle for scale
            plt.plot(np.cos(an), np.sin(an))
            plt.axis('equal')

            # définition des limites du graphique
            plt.xlim(xmin, xmax)
            plt.ylim(ymin, ymax)
        
            # affichage des lignes horizontales et verticales
            plt.plot([-1, 1], [0, 0], color='grey', ls='--')
            plt.plot([0, 0], [-1, 1], color='grey', ls='--')

            # nom des axes, avec le pourcentage d'inertie expliqué
            plt.xlabel('F{} ({}%)'.format(d1+1, round(100*pca.explained_variance_ratio_[d1],1)))
            plt.ylabel('F{} ({}%)'.format(d2+1, round(100*pca.explained_variance_ratio_[d2],1)))

            plt.title("Cercle des corrélations (F{} et F{})".format(d1+1, d2+1))
            plt.show(block=False)
In [70]:
# Principal component space
pcs = pca.components_

display_circles(pcs, 3, pca, [(0, 1)], labels=np.array(df_segment.columns))

K-Means Clustering

K-Means se base sur des calculs de distance entre les points de notre jeu de données et un point nommé centroïde.

Le KElbowVisualizer implémente la méthode de "coude" pour sélectionner le nombre optimal de clusters en ajustant le modèle avec une plage de valeurs pour 𝐾. Si le graphique linéaire ressemble à un bras, alors le « coude » (le point d'inflexion sur la courbe) est une bonne indication que le modèle sous-jacent s'adapte le mieux à ce point. Dans le visualiseur, "coude" sera annoté par une ligne en pointillés.

Le KElbowVisualizer affiche également le temps de calcul de l'entrainement du modèle de clustering par 𝐾 sous la forme d'une ligne verte en pointillés

In [71]:
# Instantiate the clustering model and visualizer
model = KMeans()
visualizer = KElbowVisualizer(
    model, k=(2,10), metric='calinski_harabasz', timings=True
)

visualizer.fit(data_pca)  # Fit the data to the visualizer
visualizer.show()        # Finalize and render the figure
Out[71]:
<AxesSubplot:title={'center':'Calinski Harabasz Score Elbow for KMeans Clustering'}, xlabel='k', ylabel='calinski harabasz score'>
In [72]:
# Instantiate the clustering model and visualizer
model = KMeans()
visualizer = KElbowVisualizer(model, k=(2,12))

visualizer.fit(data_pca)        # Fit the data to the visualizer
visualizer.show()        # Finalize and render the figure
Out[72]:
<AxesSubplot:title={'center':'Distortion Score Elbow for KMeans Clustering'}, xlabel='k', ylabel='distortion score'>

Selon la méthode du coude basée sur le score de distortion (somme moyenne des carrés des distances aux centres), le nombre de cluster idéal est de 4.

On va donc entraîner notre modèle avec ce paramètre.

In [73]:
# Fitting KMeans
km = KMeans(n_clusters=4, max_iter=50)
In [74]:
# Prediction
label = km.fit_predict(data_pca)
centroids = km.cluster_centers_
u_labels = np.unique(label)

# Graphical representation
plt.figure(figsize=(10, 10))
for i in u_labels:
    plt.scatter(data_pca[label==i, 0], data_pca[label==i, 1], label = i)
plt.scatter(centroids[:, 0], centroids[:,1], s=80, alpha=0.8, color='k')
plt.legend()
plt.title('Représentation des clusters du K-Means')
plt.show()

Silhouette Visualizer

L'analyse de silhouette est utilisée pour évaluer la densité et la séparation entre les clusters. Le score est calculé en faisant la moyenne du coefficient de silhouette pour chaque échantillon, qui est calculé comme la différence entre la distance moyenne des clusters et la distance moyenne pour chaque échantillon, normalisée par la valeur maximale. Cela produit un score compris entre -1 et +1, où les scores proches de +1 indiquent une séparation élevée et les scores proches de -1 indiquent que les échantillons ont peut-être été affectés au mauvais groupe.

Pour vérifier si le Kmeans clustering fonctionne, nous allons utiliser SilhouetteVisualizer pour afficher le coefficient de silhouette pour un échantillonage de chaque cluster. Cela permet de visualiser la densité et la séparation des clusters.

In [76]:
fig, ax = plt.subplots(2, 2, figsize=(15,8))
for i in [2, 3, 4, 5]:
    '''
    Create KMeans instance for different number of clusters
    '''
    k_means_model = KMeans(n_clusters=i, init='k-means++', n_init=10, max_iter=100, random_state=42)
    q, mod = divmod(i, 2)
    '''
    Create SilhouetteVisualizer instance with KMeans instance
    Fit the visualizer
    '''
    visualizer = SilhouetteVisualizer(k_means_model, colors='yellowbrick', ax=ax[q-1][mod])
    visualizer.fit(df_segment)

Le graphique ci-dessus est une comparaison entre 4 scénarios de clustering différents, où chaque scénario représente un certain nombre de clusters, 2 à 5 respectivement. Au fur et à mesure que le nombre de clusters augmente, nous voyons ce qui suit :

le score Silhouette moyen de tous les clusters diminue, représenté par la ligne pointillée verticale, ce qui signifie que nous n'optimisons pas à mesure que nous augmentons le nombre de clusters le saut du score moyen est clair de 2 clusters à 3 clusters (jusqu'à ~ 0,16 de ~ 0,35) indiquant que 2 est la valeur optimale

à mesure que nous augmentons les clusters, plus de points ont tendance à avoir un score Silhouette négatif confirmant que les clusters commencent à devenir moins homogènes on remarque aussi que pour k = 2, le premier cluster est presque trois fois plus grand que la taille (nombre d'instances) du second cluster

In [77]:
from yellowbrick.cluster import SilhouetteVisualizer

visualizer = SilhouetteVisualizer(km, colors='yellowbrick')

visualizer.fit(data_pca)
visualizer.show()
Out[77]:
<AxesSubplot:title={'center':'Silhouette Plot of KMeans Clustering for 93566 Samples in 4 Centers'}, xlabel='silhouette coefficient values', ylabel='cluster label'>
In [78]:
# Kmeans labels
df_segment["cluster"] = km.labels_
df_segment
<ipython-input-78-63139d3d1965>:2: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

Out[78]:
recency frequency monetary cluster
customer_unique_id
861eff4711a542e4b93843c6dd7febb0 6.152733 0.0 4.828234 1
290c77bc529b7ac935b93aa66c333dc3 5.433722 0.0 5.666427 1
060e732b5b29e8181a18229c7b0b2b5e 4.624973 0.0 4.941214 1
259dac757896d24d7702b9acbbff3f3c 5.129899 0.0 5.010235 1
345ecd01c38d18a9036ed96c73b8d066 3.465736 0.0 5.438079 3
... ... ... ... ...
1a29b476fee25c95fbafc67c5ac95cf8 4.969813 0.0 4.316154 0
d52a67c98be1cf6a5c84435bd38d095d 4.997212 0.0 4.744062 1
e9f50caf99f032f0bf3c55141f019d99 4.962845 0.0 3.610918 0
73c2643a0a458b49f58cea58833b192e 5.700444 0.0 6.535241 1
84732c5050c01db9b23e19ba39899398 5.537334 0.0 2.638343 0

93566 rows × 4 columns

Analyse des clusters

In [79]:
fig = px.scatter_3d(df_segment, x=df_segment["recency"], y=df_segment["frequency"], z=df_segment["monetary"], color=df_segment["cluster"])
fig.show()
In [80]:
sns.boxplot(x='cluster',y='monetary', data=df_segment)
Out[80]:
<AxesSubplot:xlabel='cluster', ylabel='monetary'>
In [81]:
sns.boxplot(x='cluster',y='frequency', data=df_segment)
Out[81]:
<AxesSubplot:xlabel='cluster', ylabel='frequency'>
In [82]:
sns.boxplot(x='cluster',y='recency', data=df_segment)
Out[82]:
<AxesSubplot:xlabel='cluster', ylabel='recency'>
In [83]:
# Cluster 0
In [84]:
cluster0 = df_segment[df_segment['cluster'].isin([0])]
In [85]:
customers_0 = cluster0.index
df0 = df.loc[df['customer_unique_id'].isin(customers_0)]
df0.shape
Out[85]:
(35755, 36)
In [86]:
df0.head()
Out[86]:
index customer_id customer_unique_id customer_zip_code_prefix customer_city customer_state geolocation_lat geolocation_lng order_id order_status order_purchase_timestamp order_delivered_carrier_date payment_sequential payment_type payment_installments payment_value review_score order_item_id product_id shipping_limit_date price freight_value product_category_name seller_city seller_state recency frequency monetary R F M RFM_Segment_Concat RFM_Score RFM_Level nb_days_to_deliver nb_days_estimated_to_deliver
6 6 fd826e7cf63160e536e0908c76c3f441 addec96d2e059c80c30fe6871d30d177 4534 sao paulo SP -23.584221 -46.674053 36e694cf4cbc2a4803200c35e84abdc4 delivered 2018-02-19 14:38:35 2018-02-20 00:03:39 1.0 credit_card 1.0 22.77 5 1.0 b5466db4cecf95c3c1be0ba32538ce1a 2018-02-23 14:50:37 14.99 7.78 Electronics and Tech sao paulo SP 5.257495 0.0 2.707383 3 3 1 331 7 Loyal 1 13
9 9 9fb35e4ed6f0a14a4977cd9aea4042bb 2a7745e1ed516b289ed9b29c7d0539a5 39400 montes claros MG -16.721976 -43.862914 8428e578bb1cf839ae26a6b7615502b9 delivered 2017-11-27 17:23:20 2017-11-28 23:04:41 1.0 credit_card 1.0 40.40 5 1.0 6b874ff4c71d18e91079b40396bf176a 2017-12-01 18:01:20 25.30 15.10 Miscellaneous piracicaba SP 5.616771 0.0 3.230804 2 1 1 211 4 Needs Attention 11 22
16 16 c5c61596a3b6bd0cee5766992c48a9a1 b6e99561fe6f34a55b0b7da92f8ed775 7124 guarulhos SP -23.433637 -46.533773 ec28553b12bef4b538e5d7454464a1c6 delivered 2018-06-18 13:34:21 2018-06-19 15:07:00 1.0 credit_card 1.0 16.53 5 1.0 e64e4ef7b809314306d9c09635d8cd30 2018-06-22 13:55:47 7.09 9.44 pet_shop praia grande SP 4.290459 0.0 1.958685 4 3 1 431 8 Champions 3 14
17 17 49d0ea0986edde72da777f15456a0ee0 3e6fd6b2f0d499456a6a6820a40f2d79 68485 pacaja PA -3.815265 -50.618767 ab797176e01c2fa3d13aa1fe3e63d9ee delivered 2017-11-01 21:54:10 2017-11-06 15:43:02 1.0 credit_card 5.0 54.62 4 1.0 a0cb23ea9b98f51407ac7bd1fcd29c2e 2017-11-07 22:06:00 28.99 25.63 Electronics and Tech sao paulo SP 5.707110 0.0 3.366951 2 1 1 211 4 Needs Attention 30 40
19 19 690172ab319622688d3b4df42f676898 a96d5cfa0d3181817e2b946f921ea021 74914 aparecida de goiania GO -16.753520 -49.263289 aaff8afa47c8426e414a6d908a97713c delivered 2017-10-15 11:08:48 2017-10-16 21:36:29 1.0 credit_card 2.0 232.71 1 3.0 368c6c730842d78016ad823897a372db 2017-10-19 11:25:49 59.90 17.67 Home Decor sao jose do rio preto SP 5.765191 0.0 4.092677 2 3 2 232 7 Loyal 10 21
In [87]:
product_cluster0 = df0.groupby('product_category_name').agg({'payment_value': lambda payment_value: payment_value.sum()})

DBSCAN

In [90]:
# Compute DBSCAN
db = DBSCAN(eps = 1, min_samples=5,n_jobs=-1).fit(data_pca)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels2D = db.labels_

# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels2D)) - (1 if -1 in labels2D else 0)
n_noise_ = list(labels2D).count(-1)

print('2D Estimated number of clusters: %d' % n_clusters_)
print('2D Estimated number of noise points: %d' % n_noise_)
print("2D Silhouette Coefficient: %0.3f" % metrics.silhouette_score(data_pca, labels2D))
2D Estimated number of clusters: 3
2D Estimated number of noise points: 12
2D Silhouette Coefficient: 0.609
In [91]:
db1 = DBSCAN(eps=0.4, min_samples=3).fit(data_pca)
core_samples_mask = np.zeros_like(db1.labels_, dtype=bool)
core_samples_mask[db1.core_sample_indices_] = True
labels2D2 = db1.labels_

# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels2D2)) - (1 if -1 in labels2D2 else 0)
n_noise_ = list(labels2D2).count(-1)

print('2D Estimated number of clusters: %d' % n_clusters_)
print('2D Estimated number of noise points: %d' % n_noise_)
print("2D Silhouette Coefficient: %0.3f" % metrics.silhouette_score(data_pca, labels2D2))
2D Estimated number of clusters: 27
2D Estimated number of noise points: 30
2D Silhouette Coefficient: 0.532
In [92]:
db2 = DBSCAN(eps=0.1, min_samples=10).fit(data_pca)
core_samples_mask = np.zeros_like(db2.labels_, dtype=bool)
core_samples_mask[db2.core_sample_indices_] = True
labels2D3 = db2.labels_

# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels2D3)) - (1 if -1 in labels2D3 else 0)
n_noise_ = list(labels2D3).count(-1)

print('2D Estimated number of clusters: %d' % n_clusters_)
print('2D Estimated number of noise points: %d' % n_noise_)
print("2D Silhouette Coefficient: %0.3f" % metrics.silhouette_score(data_pca, labels2D3))
2D Estimated number of clusters: 64
2D Estimated number of noise points: 2355
2D Silhouette Coefficient: -0.263
In [93]:
# Création d'un dataset avec features et labels
df_dbscan = df.copy()
df_dbscan['labels'] = db.labels_
In [94]:
stat = pd.pivot_table(df_dbscan,
                      values=df_dbscan.columns,
                      index=['labels', 'customer_unique_id'])
for x in df_dbscan['labels'].value_counts().index:
    display(stat.loc[x, :].describe())
RFM_Score customer_zip_code_prefix freight_value frequency geolocation_lat geolocation_lng index monetary nb_days_estimated_to_deliver nb_days_to_deliver order_item_id payment_installments payment_sequential payment_value price recency review_score
count 88112.000000 88112.000000 88112.000000 88112.0 88112.000000 88112.000000 88112.000000 88112.000000 88112.000000 88112.000000 88112.000000 88112.000000 88112.000000 88112.000000 88112.000000 88112.000000 88112.000000
mean 7.422723 35245.216509 20.250875 0.0 -21.173837 -46.183769 46994.798427 4.363169 22.853629 11.643204 1.136735 2.892149 1.022869 158.138225 126.364882 5.156661 4.152476
std 1.929398 29832.078349 15.930192 0.0 5.633083 4.066961 27134.956513 0.922449 8.758360 9.483433 0.522171 2.688016 0.224705 218.298654 190.775620 0.950948 1.285404
min 3.000000 1004.000000 0.000000 0.0 -33.689948 -72.668881 0.000000 0.788457 1.000000 1.000000 1.000000 0.000000 1.000000 0.000000 2.200000 0.000000 1.000000
25% 6.000000 11432.000000 13.370000 0.0 -23.590078 -48.138909 23504.750000 3.737670 17.000000 6.000000 1.000000 1.000000 1.000000 60.230000 42.000000 4.736198 4.000000
50% 7.000000 24457.000000 16.430000 0.0 -22.922759 -46.632826 46975.500000 4.369448 22.000000 9.000000 1.000000 2.000000 1.000000 103.335000 79.000000 5.389072 5.000000
75% 9.000000 59054.000000 21.320000 0.0 -20.080869 -43.608100 70478.250000 4.940928 28.000000 15.000000 1.000000 4.000000 1.000000 175.090000 139.900000 5.849325 5.000000
max 12.000000 99980.000000 409.680000 0.0 42.184003 -8.723762 94016.000000 8.815073 153.000000 208.000000 21.000000 24.000000 17.000000 13664.080000 6735.000000 6.543912 5.000000
RFM_Score customer_zip_code_prefix freight_value frequency geolocation_lat geolocation_lng index monetary nb_days_estimated_to_deliver nb_days_to_deliver order_item_id payment_installments payment_sequential payment_value price recency review_score
count 2478.000000 2478.000000 2478.000000 2.478000e+03 2478.000000 2478.000000 2478.000000 2478.000000 2478.000000 2478.000000 2478.000000 2478.000000 2478.000000 2478.000000 2478.000000 2478.000000 2478.000000
mean 9.953188 33705.077280 19.092647 6.931472e-01 -21.628714 -46.323754 47208.983858 5.069005 23.287732 11.414447 1.206618 3.290759 1.026836 143.173251 107.495355 5.088798 4.190476
std 1.409147 29792.612338 10.797672 3.808834e-14 4.997745 3.874503 19540.066773 0.744927 7.619108 7.135557 0.547211 2.642870 0.202891 147.172071 127.356209 0.936177 1.091420
min 6.000000 1005.000000 0.000000 6.931472e-01 -33.689948 -67.844090 379.000000 2.483239 1.000000 1.000000 1.000000 1.000000 1.000000 5.305000 5.990000 0.000000 1.000000
25% 9.000000 9725.250000 13.200000 6.931472e-01 -23.584576 -48.004578 32757.875000 4.603168 18.500000 6.500000 1.000000 1.000000 1.000000 70.203750 49.900000 4.663439 3.500000
50% 10.000000 22764.000000 16.450000 6.931472e-01 -22.953082 -46.635228 47347.250000 5.067331 23.000000 10.000000 1.000000 2.500000 1.000000 107.432500 79.275000 5.308268 4.500000
75% 11.000000 50895.000000 20.867500 6.931472e-01 -20.649426 -43.917931 61634.250000 5.543858 27.500000 14.500000 1.000000 5.000000 1.000000 169.700000 127.918750 5.762051 5.000000
max 12.000000 99750.000000 107.390000 6.931472e-01 2.811251 -34.823063 93123.000000 8.907612 84.000000 73.000000 9.000000 17.000000 7.000000 3785.815000 3694.000000 6.539586 5.000000
RFM_Score customer_zip_code_prefix freight_value frequency geolocation_lat geolocation_lng index monetary nb_days_estimated_to_deliver nb_days_to_deliver order_item_id payment_installments payment_sequential payment_value price recency review_score
count 167.000000 167.000000 167.000000 1.670000e+02 167.000000 167.000000 167.000000 167.000000 167.000000 167.000000 167.000000 167.000000 167.000000 167.000000 167.000000 167.000000 167.000000
mean 10.305389 33711.249501 18.463154 1.098612e+00 -21.555407 -46.070386 47628.380240 5.441055 24.224551 10.918164 1.281437 3.022954 1.031936 145.044212 100.787126 5.018364 4.443114
std 1.395831 28568.277612 8.975673 2.004412e-15 4.886981 3.682286 15304.143386 0.749708 6.681533 6.003621 0.556062 2.596653 0.160461 122.109983 82.485648 0.952209 0.832283
min 6.000000 2082.000000 5.033333 1.098612e+00 -31.755497 -56.458624 15686.000000 3.030134 10.666667 1.000000 1.000000 1.000000 1.000000 15.133333 6.900000 2.197225 1.000000
25% 9.000000 12285.500000 14.051667 1.098612e+00 -23.549435 -47.855063 36062.000000 4.922418 19.333333 7.333333 1.000000 1.000000 1.000000 63.935000 45.778333 4.595069 4.000000
50% 10.000000 23648.333333 16.680000 1.098612e+00 -22.903996 -46.552179 47231.000000 5.415211 24.000000 9.666667 1.000000 2.000000 1.000000 114.250000 74.333333 5.135798 5.000000
75% 11.000000 48754.000000 20.095000 1.098612e+00 -20.511478 -43.339018 58302.333333 5.960851 27.500000 12.666667 1.333333 4.500000 1.000000 181.913333 129.313333 5.793014 5.000000
max 12.000000 97670.000000 72.903333 1.098612e+00 -2.890238 -34.839611 86023.333333 7.433028 44.000000 42.666667 4.000000 12.000000 2.333333 800.160000 563.640000 6.400257 5.000000
RFM_Score customer_zip_code_prefix freight_value frequency geolocation_lat geolocation_lng index monetary nb_days_estimated_to_deliver nb_days_to_deliver order_item_id payment_installments payment_sequential payment_value price recency review_score
count 7.000000 7.000000 7.000000 7.000000 7.000000 7.000000 7.000000 7.000000 7.000000 7.000000 7.000000 7.000000 7.0 7.000000 7.000000 7.000000 7.000000
mean 9.142857 14623.428571 12.374286 0.412910 -23.009261 -45.931527 35926.285714 2.712851 14.761905 6.142857 1.190476 2.928571 1.0 137.919524 115.492381 2.330899 4.071429
std 1.676163 8098.704894 7.693493 0.532429 0.418763 1.869878 19992.399013 2.832598 7.778685 3.760699 0.377964 3.445148 0.0 255.065655 264.054359 2.379662 1.426785
min 7.000000 5171.000000 0.000000 0.000000 -23.610531 -47.233124 2966.000000 -0.162519 5.333333 2.000000 1.000000 1.000000 1.0 19.080000 0.850000 0.000000 1.000000
25% 8.000000 9869.000000 8.265000 0.000000 -23.275954 -47.137976 29620.000000 -0.162519 8.000000 2.500000 1.000000 1.000000 1.0 24.251667 0.850000 0.346574 4.000000
50% 9.000000 13189.000000 11.330000 0.000000 -22.912452 -46.847494 33849.000000 3.732896 15.000000 7.000000 1.000000 1.000000 1.0 32.230000 20.900000 1.098612 4.500000
75% 10.000000 19273.000000 18.230000 0.895880 -22.778854 -45.011631 43400.500000 4.587640 21.500000 9.000000 1.166667 3.250000 1.0 76.361667 36.048333 4.836250 5.000000
max 12.000000 25720.000000 22.300000 1.098612 -22.432229 -43.140860 68628.000000 6.569341 24.000000 11.000000 2.000000 10.000000 1.0 712.900000 712.900000 4.852030 5.000000
In [95]:
df_dbscan.columns
Out[95]:
Index(['index', 'customer_id', 'customer_unique_id',
       'customer_zip_code_prefix', 'customer_city', 'customer_state',
       'geolocation_lat', 'geolocation_lng', 'order_id', 'order_status',
       'order_purchase_timestamp', 'order_delivered_carrier_date',
       'payment_sequential', 'payment_type', 'payment_installments',
       'payment_value', 'review_score', 'order_item_id', 'product_id',
       'shipping_limit_date', 'price', 'freight_value',
       'product_category_name', 'seller_city', 'seller_state', 'recency',
       'frequency', 'monetary', 'R', 'F', 'M', 'RFM_Segment_Concat',
       'RFM_Score', 'RFM_Level', 'nb_days_to_deliver',
       'nb_days_estimated_to_deliver', 'labels'],
      dtype='object')
In [96]:
df_dbsc = df_dbscan[['payment_type', 'payment_value', 'review_score', 'price','product_category_name', 
       'recency', 'frequency', 'monetary', 'RFM_Level', 'nb_days_to_deliver',
       'nb_days_estimated_to_deliver', 'labels']]
In [97]:
# Représentation graphique des features pour chaque cluster
for col in df_dbsc.columns.to_list():
    plt.figure(figsize=(15, 7))
    sns.boxplot(y=col, x='labels', data=df_dbsc,
                showfliers=False)
    plt.show()
In [98]:
# Analyse des différentes catégories dans les labels
index_tot = [df_dbsc[df_dbsc['labels'] == x].index
             for x in df_dbsc['labels'].value_counts().index]

plt.figure(figsize=(20, 12))
for x in range(len(index_tot)):
    order = df_dbsc.loc[index_tot[x], 'payment_type'].value_counts()
    order_hue = order.index
    plt.subplot(2, len(index_tot)/2, x+1)
    sns.countplot(y=df_dbsc.loc[index_tot[x], 'payment_type'],
                  order=order_hue,
                  palette='Blues_r')
    plt.title(f"Cluster {x}", fontsize=20)

plt.figure(figsize=(20, 12))
for x in range(len(index_tot)):
    order = df_dbsc.loc[index_tot[x], 'product_category_name'].value_counts()
    order_hue = order.index
    plt.subplot(2, len(index_tot)/2, x+1)
    sns.countplot(y=df_dbsc.loc[index_tot[x], 'product_category_name'],
                  order=order_hue,
                  palette='Blues_r')
    plt.title(f"Cluster {x}", fontsize=20)
<ipython-input-98-dba46e0d6c79>:9: MatplotlibDeprecationWarning:

Passing non-integers as three-element position specification is deprecated since 3.3 and will be removed two minor releases later.

<ipython-input-98-dba46e0d6c79>:9: MatplotlibDeprecationWarning:

Passing non-integers as three-element position specification is deprecated since 3.3 and will be removed two minor releases later.

<ipython-input-98-dba46e0d6c79>:9: MatplotlibDeprecationWarning:

Passing non-integers as three-element position specification is deprecated since 3.3 and will be removed two minor releases later.

<ipython-input-98-dba46e0d6c79>:9: MatplotlibDeprecationWarning:

Passing non-integers as three-element position specification is deprecated since 3.3 and will be removed two minor releases later.

<ipython-input-98-dba46e0d6c79>:19: MatplotlibDeprecationWarning:

Passing non-integers as three-element position specification is deprecated since 3.3 and will be removed two minor releases later.

<ipython-input-98-dba46e0d6c79>:19: MatplotlibDeprecationWarning:

Passing non-integers as three-element position specification is deprecated since 3.3 and will be removed two minor releases later.

<ipython-input-98-dba46e0d6c79>:19: MatplotlibDeprecationWarning:

Passing non-integers as three-element position specification is deprecated since 3.3 and will be removed two minor releases later.

<ipython-input-98-dba46e0d6c79>:19: MatplotlibDeprecationWarning:

Passing non-integers as three-element position specification is deprecated since 3.3 and will be removed two minor releases later.

In [99]:
def display_factorial_planes(X_projected, n_comp, pca, axis_ranks, hue_serie=None,
                             xlimit=None, ylimit=None, labels=None,
                             alpha=1, illustrative_var=None):
    """Tracés des projections des individus dans les plans factoriels"""
    import matplotlib
    matplotlib.rcdefaults()
    for d1,d2 in axis_ranks:
        if d2 < n_comp:
 
            # initialisation de la figure       
            fig = plt.figure(figsize=(7, 6))
        
            # affichage des points
            if illustrative_var is None:
                sns.scatterplot(x=X_projected[:, d1], 
                                y=X_projected[:, d2], 
                                hue=hue_serie, 
                                alpha=alpha, 
                                palette='tab10')
            else:
                illustrative_var = np.array(illustrative_var)
                for value in np.unique(illustrative_var):
                    selected = np.where(illustrative_var == value)
                    sns.scatterplot(x=X_projected[selected, d1],
                                    y=X_projected[selected, d2], 
                                    hue=hue_serie, 
                                    alpha=alpha, 
                                    label=value, 
                                    palette='tab10')
                plt.legend()

            # affichage des labels des points
            if labels is not None:
                for i,(x,y) in enumerate(X_projected[:, [d1, d2]]):
                    plt.text(x, y, labels[i],
                              fontsize='14', ha='center',va='center') 
                
            # détermination des limites du graphique
            boundary = np.max(np.abs(X_projected[:, [d1, d2]])) * 1.1
            if (xlimit, ylimit) == (None, None) :
                plt.xlim([-boundary, boundary])
                plt.ylim([-boundary, boundary])
            else:
                plt.xlim([-xlimit, xlimit])
                plt.ylim([-ylimit, ylimit])
        
            # affichage des lignes horizontales et verticales
            plt.plot([-100, 100], [0, 0], color='grey', ls='--')
            plt.plot([0, 0], [-100, 100], color='grey', ls='--')

            # nom des axes, avec le pourcentage d'inertie expliqué
            plt.xlabel('F{} ({}%)'.format(d1+1, round(100*pca.explained_variance_ratio_[d1], 1)))
            plt.ylabel('F{} ({}%)'.format(d2+1, round(100*pca.explained_variance_ratio_[d2], 1)))

            plt.title("Projection des individus (sur F{} et F{})".format(d1+1, d2+1))
            plt.savefig(f"{d1}_{d2}_factorial_plane.png")
            plt.show(block=False)
In [100]:
# Projection des individus
display_factorial_planes(data_pca, 3, pca,
                         [(0, 1), (1, 2)], hue_serie=df_dbsc['labels'])
plt.show()

Stabilité des 2 clustering

In [101]:
from sklearn.base import clone
from sklearn.utils import check_random_state
rng = np.random.RandomState(1)

def cluster_stability(X, est, n_iter=20, random_state=None):
    labels = []
    indices = []
    for i in range(n_iter):
        # draw bootstrap samples, store indices
        sample_indices = rng.randint(0, X.shape[0], X.shape[0])
        indices.append(sample_indices)
        est = clone(est)
        if hasattr(est, "random_state"):
            # randomize estimator if possible
            est.random_state = rng.randint(1e5)
        X_bootstrap = X[sample_indices]
        est.fit(X_bootstrap)
        # store clustering outcome using original indices
        relabel = -np.ones(X.shape[0], dtype=np.int)
        relabel[sample_indices] = est.labels_
        labels.append(relabel)
    scores = []
    for l, i in zip(labels, indices):
        for k, j in zip(labels, indices):
            # we also compute the diagonal which is a bit silly
            in_both = np.intersect1d(i, j)
            scores.append(adjusted_rand_score(l[in_both], k[in_both]))
    return np.mean(scores)
In [102]:
km_stability = []

cluster_range = range(2, 10, 1)
for n_clusters in cluster_range:
    print(n_clusters)
    km = KMeans(n_clusters=n_clusters, n_init=10, init="random")
    km_stability.append(cluster_stability(data_pca, km))
2
3
4
5
6
7
8
9
In [103]:
db_stability = []
n_clusters_db = []
for eps in np.linspace(.2, 1, 10):
    print(eps)
    db_stability.append(cluster_stability(data_pca, DBSCAN(eps=eps)))
    n_clusters_db.append(len(np.unique(DBSCAN(eps=eps).fit(data_pca).labels_)))
0.2
0.2888888888888889
0.37777777777777777
0.4666666666666667
0.5555555555555556
0.6444444444444445
0.7333333333333334
0.8222222222222222
0.9111111111111112
1.0
In [104]:
cluster_range = range(2, 10, 1)

plt.plot(cluster_range, km_stability, label="k-means")
plt.plot(n_clusters_db, db_stability, label="DBSCAN")
for eps, n_clusters, stability in zip(np.linspace(.2, 1, 15), n_clusters_db, db_stability):
    plt.text(n_clusters, stability, "{:.2f}".format(eps))
plt.legend()
plt.xlabel("n_clusters")
plt.ylabel("stability")
Out[104]:
Text(0, 0.5, 'stability')
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]: